From 8d7bc088b150445b565a9a274e3804965281cff0 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Wed, 6 Nov 2024 14:00:45 +0800 Subject: [PATCH] seek_exact + cost based intersection Adds `seek_exact` and `cost` to `DocSet` for a more efficient intersection. Unlike `seek`, `seek_exact` does not require the DocSet to advance to the next hit, if the target does not exist. `cost` allows to address the different DocSet types and their cost model and is used to determine the DocSet that drives the intersection. E.g. fast field range queries may do a full scan. Phrase queries load the positions to check if a we have a hit. They both have a higher cost than their size_hint would suggest. Improves `size_hint` estimation for intersection and union, by having a estimation based on random distribution with a co-location factor. Refactor range query benchmark. Closes #2531 *Future Work* Implement `seek_exact` for BufferedUnionScorer and RangeDocSet (fast field range queries) Evaluate replacing `seek` with `seek_exact` to reduce code complexity --- Cargo.toml | 7 +- benches/range_query.rs | 260 ++++++++++ src/docset.rs | 55 +++ src/postings/mod.rs | 15 +- src/query/all_query.rs | 9 + src/query/boolean_query/block_wand.rs | 11 +- src/query/boolean_query/boolean_weight.rs | 43 +- src/query/boost_query.rs | 7 + src/query/const_score_query.rs | 4 + src/query/disjunction.rs | 22 + src/query/intersection.rs | 142 ++++-- src/query/mod.rs | 1 + .../phrase_prefix_scorer.rs | 12 + src/query/phrase_query/phrase_scorer.rs | 29 +- .../range_query/fast_field_range_doc_set.rs | 31 +- .../range_query/range_query_fastfield.rs | 446 ------------------ src/query/reqopt_scorer.rs | 9 + src/query/size_hint.rs | 138 ++++++ src/query/union/buffered_union.rs | 17 +- src/query/union/mod.rs | 8 + src/query/union/simple_union.rs | 5 + 21 files changed, 749 insertions(+), 522 deletions(-) create mode 100644 benches/range_query.rs create mode 100644 src/query/size_hint.rs diff --git a/Cargo.toml b/Cargo.toml index cb34078b0d..41ff260d36 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -72,7 +72,7 @@ fnv = "1.0.7" winapi = "0.3.9" [dev-dependencies] -binggan = "0.14.0" +binggan = "0.14.2" rand = "0.8.5" maplit = "1.0.2" matches = "0.1.9" @@ -162,3 +162,8 @@ harness = false [[bench]] name = "agg_bench" harness = false + +[[bench]] +name = "range_query" +harness = false + diff --git a/benches/range_query.rs b/benches/range_query.rs new file mode 100644 index 0000000000..255b35ef3f --- /dev/null +++ b/benches/range_query.rs @@ -0,0 +1,260 @@ +use std::fmt::Display; +use std::net::Ipv6Addr; +use std::ops::RangeInclusive; + +use binggan::plugins::PeakMemAllocPlugin; +use binggan::{black_box, BenchRunner, OutputValue, PeakMemAlloc, INSTRUMENTED_SYSTEM}; +use columnar::MonotonicallyMappableToU128; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; +use tantivy::collector::{Count, TopDocs}; +use tantivy::query::QueryParser; +use tantivy::schema::*; +use tantivy::{doc, Index}; + +#[global_allocator] +pub static GLOBAL: &PeakMemAlloc = &INSTRUMENTED_SYSTEM; + +fn main() { + bench_range_query(); +} + +fn bench_range_query() { + let index = get_index_0_to_100(); + let mut runner = BenchRunner::new(); + runner.add_plugin(PeakMemAllocPlugin::new(GLOBAL)); + + runner.set_name("range_query on u64"); + let field_name_and_descr: Vec<_> = vec![ + ("id", "Single Valued Range Field"), + ("ids", "Multi Valued Range Field"), + ]; + let range_num_hits = vec![ + ("90_percent", get_90_percent()), + ("10_percent", get_10_percent()), + ("1_percent", get_1_percent()), + ]; + + test_range(&mut runner, &index, &field_name_and_descr, range_num_hits); + + runner.set_name("range_query on ip"); + let field_name_and_descr: Vec<_> = vec![ + ("ip", "Single Valued Range Field"), + ("ips", "Multi Valued Range Field"), + ]; + let range_num_hits = vec![ + ("90_percent", get_90_percent_ip()), + ("10_percent", get_10_percent_ip()), + ("1_percent", get_1_percent_ip()), + ]; + + test_range(&mut runner, &index, &field_name_and_descr, range_num_hits); +} + +fn test_range( + runner: &mut BenchRunner, + index: &Index, + field_name_and_descr: &[(&str, &str)], + range_num_hits: Vec<(&str, RangeInclusive)>, +) { + for (field, suffix) in field_name_and_descr { + let term_num_hits = vec![ + ("", ""), + ("1_percent", "veryfew"), + ("10_percent", "few"), + ("90_percent", "most"), + ]; + let mut group = runner.new_group(); + group.set_name(suffix); + // all intersect combinations + for (range_name, range) in &range_num_hits { + for (term_name, term) in &term_num_hits { + let index = &index; + let test_name = if term_name.is_empty() { + format!("id_range_hit_{}", range_name) + } else { + format!( + "id_range_hit_{}_intersect_with_term_{}", + range_name, term_name + ) + }; + group.register(test_name, move |_| { + let query = if term_name.is_empty() { + "".to_string() + } else { + format!("AND id_name:{}", term) + }; + black_box(execute_query(field, range, &query, index)); + }); + } + } + group.run(); + } +} + +fn get_index_0_to_100() -> Index { + let mut rng = StdRng::from_seed([1u8; 32]); + let num_vals = 100_000; + let docs: Vec<_> = (0..num_vals) + .map(|_i| { + let id_name = if rng.gen_bool(0.01) { + "veryfew".to_string() // 1% + } else if rng.gen_bool(0.1) { + "few".to_string() // 9% + } else { + "most".to_string() // 90% + }; + Doc { + id_name, + id: rng.gen_range(0..100), + // Multiply by 1000, so that we create most buckets in the compact space + // The benches depend on this range to select n-percent of elements with the + // methods below. + ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000), + } + }) + .collect(); + + create_index_from_docs(&docs) +} + +#[derive(Clone, Debug)] +pub struct Doc { + pub id_name: String, + pub id: u64, + pub ip: Ipv6Addr, +} + +pub fn create_index_from_docs(docs: &[Doc]) -> Index { + let mut schema_builder = Schema::builder(); + let id_u64_field = schema_builder.add_u64_field("id", INDEXED | STORED | FAST); + let ids_u64_field = + schema_builder.add_u64_field("ids", NumericOptions::default().set_fast().set_indexed()); + + let id_f64_field = schema_builder.add_f64_field("id_f64", INDEXED | STORED | FAST); + let ids_f64_field = schema_builder.add_f64_field( + "ids_f64", + NumericOptions::default().set_fast().set_indexed(), + ); + + let id_i64_field = schema_builder.add_i64_field("id_i64", INDEXED | STORED | FAST); + let ids_i64_field = schema_builder.add_i64_field( + "ids_i64", + NumericOptions::default().set_fast().set_indexed(), + ); + + let text_field = schema_builder.add_text_field("id_name", STRING | STORED); + let text_field2 = schema_builder.add_text_field("id_name_fast", STRING | STORED | FAST); + + let ip_field = schema_builder.add_ip_addr_field("ip", FAST); + let ips_field = schema_builder.add_ip_addr_field("ips", FAST); + + let schema = schema_builder.build(); + + let index = Index::create_in_ram(schema); + + { + let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap(); + for doc in docs.iter() { + index_writer + .add_document(doc!( + ids_i64_field => doc.id as i64, + ids_i64_field => doc.id as i64, + ids_f64_field => doc.id as f64, + ids_f64_field => doc.id as f64, + ids_u64_field => doc.id, + ids_u64_field => doc.id, + id_u64_field => doc.id, + id_f64_field => doc.id as f64, + id_i64_field => doc.id as i64, + text_field => doc.id_name.to_string(), + text_field2 => doc.id_name.to_string(), + ips_field => doc.ip, + ips_field => doc.ip, + ip_field => doc.ip, + )) + .unwrap(); + } + + index_writer.commit().unwrap(); + } + index +} + +fn get_90_percent() -> RangeInclusive { + 0..=90 +} + +fn get_10_percent() -> RangeInclusive { + 0..=10 +} + +fn get_1_percent() -> RangeInclusive { + 10..=10 +} + +fn get_90_percent_ip() -> RangeInclusive { + let start = Ipv6Addr::from_u128(0); + let end = Ipv6Addr::from_u128(90 * 1000); + start..=end +} + +fn get_10_percent_ip() -> RangeInclusive { + let start = Ipv6Addr::from_u128(0); + let end = Ipv6Addr::from_u128(10 * 1000); + start..=end +} + +fn get_1_percent_ip() -> RangeInclusive { + let start = Ipv6Addr::from_u128(10 * 1000); + let end = Ipv6Addr::from_u128(10 * 1000); + start..=end +} + +struct NumHits { + count: usize, +} +impl OutputValue for NumHits { + fn column_title() -> &'static str { + "NumHits" + } + fn format(&self) -> Option { + Some(self.count.to_string()) + } +} + +fn execute_query( + field: &str, + id_range: &RangeInclusive, + suffix: &str, + index: &Index, +) -> NumHits { + let gen_query_inclusive = |from: &T, to: &T| { + format!( + "{}:[{} TO {}] {}", + field, + &from.to_string(), + &to.to_string(), + suffix + ) + }; + + let query = gen_query_inclusive(id_range.start(), id_range.end()); + execute_query_(&query, index) +} + +fn execute_query_(query: &str, index: &Index) -> NumHits { + let query_from_text = |text: &str| { + QueryParser::for_index(index, vec![]) + .parse_query(text) + .unwrap() + }; + let query = query_from_text(query); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + let num_hits = searcher + .search(&query, &(TopDocs::with_limit(10), Count)) + .unwrap() + .1; + NumHits { count: num_hits } +} diff --git a/src/docset.rs b/src/docset.rs index 96ea307099..046a9c3c7f 100644 --- a/src/docset.rs +++ b/src/docset.rs @@ -49,6 +49,26 @@ pub trait DocSet: Send { doc } + /// Seeks to the target if possible and checks if the target is an exact match. + /// + /// Implementations may choose to advance past the target if target does not exist. + /// The return value is `true` if the docset contains target, and `false` otherwise. + /// + /// DocSets that already have an efficient `seek` method don't need to implement `seek_exact`. + /// All wapper DocSets should forward `seek_exact` to the underlying DocSet. + /// + /// ## API Behaviour + /// If `seek_exact` is returning true, a call to `doc()` has to return target. + /// If `seek_exact` is returning false, a call to `doc()` may return the previous doc, + /// which may be lower than target. + fn seek_exact(&mut self, target: DocId) -> bool { + let current_doc = self.doc(); + if current_doc < target { + self.seek(target); + } + self.doc() == target + } + /// Fills a given mutable buffer with the next doc ids from the /// `DocSet` /// @@ -87,6 +107,23 @@ pub trait DocSet: Send { /// length of the docset. fn size_hint(&self) -> u32; + /// Returns a best-effort hint of the + /// cost to drive the docset. + /// + /// By default this returns `size_hint()`. + /// + /// DocSets may have vastly different cost depending on their type, + /// e.g. an intersection with 10 hits is much cheaper than + /// a phrase search with 10 hits, since it needs to load positions. + /// + /// ### Future Work + /// We may want to differentiate `DocSet` costs more more granular, e.g. + /// creation_cost, advance_cost, seek_cost on to get a good estimation + /// what query types to choose. + fn cost(&self) -> u64 { + self.size_hint() as u64 + } + /// Returns the number documents matching. /// Calling this method consumes the `DocSet`. fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 { @@ -126,6 +163,10 @@ impl DocSet for &mut dyn DocSet { (**self).seek(target) } + fn seek_exact(&mut self, target: DocId) -> bool { + (**self).seek_exact(target) + } + fn doc(&self) -> u32 { (**self).doc() } @@ -134,6 +175,10 @@ impl DocSet for &mut dyn DocSet { (**self).size_hint() } + fn cost(&self) -> u64 { + (**self).cost() + } + fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 { (**self).count(alive_bitset) } @@ -154,6 +199,11 @@ impl DocSet for Box { unboxed.seek(target) } + fn seek_exact(&mut self, target: DocId) -> bool { + let unboxed: &mut TDocSet = self.borrow_mut(); + unboxed.seek_exact(target) + } + fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize { let unboxed: &mut TDocSet = self.borrow_mut(); unboxed.fill_buffer(buffer) @@ -169,6 +219,11 @@ impl DocSet for Box { unboxed.size_hint() } + fn cost(&self) -> u64 { + let unboxed: &TDocSet = self.borrow(); + unboxed.cost() + } + fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 { let unboxed: &mut TDocSet = self.borrow_mut(); unboxed.count(alive_bitset) diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 4dee06aad5..efc0e069dc 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -667,12 +667,15 @@ mod bench { .read_postings(&TERM_D, IndexRecordOption::Basic) .unwrap() .unwrap(); - let mut intersection = Intersection::new(vec![ - segment_postings_a, - segment_postings_b, - segment_postings_c, - segment_postings_d, - ]); + let mut intersection = Intersection::new( + vec![ + segment_postings_a, + segment_postings_b, + segment_postings_c, + segment_postings_d, + ], + reader.searcher().num_docs() as u32, + ); while intersection.advance() != TERMINATED {} }); } diff --git a/src/query/all_query.rs b/src/query/all_query.rs index 11172f9ed3..7a03cda8a8 100644 --- a/src/query/all_query.rs +++ b/src/query/all_query.rs @@ -58,6 +58,15 @@ impl DocSet for AllScorer { self.doc } + fn seek(&mut self, target: DocId) -> DocId { + debug_assert!(target >= self.doc); + self.doc = target; + if self.doc >= self.max_doc { + self.doc = TERMINATED; + } + self.doc + } + fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize { if self.doc() == TERMINATED { return 0; diff --git a/src/query/boolean_query/block_wand.rs b/src/query/boolean_query/block_wand.rs index 77febab31c..b16455d240 100644 --- a/src/query/boolean_query/block_wand.rs +++ b/src/query/boolean_query/block_wand.rs @@ -368,10 +368,14 @@ mod tests { checkpoints } - fn compute_checkpoints_manual(term_scorers: Vec, n: usize) -> Vec<(DocId, Score)> { + fn compute_checkpoints_manual( + term_scorers: Vec, + n: usize, + max_doc: u32, + ) -> Vec<(DocId, Score)> { let mut heap: BinaryHeap = BinaryHeap::with_capacity(n); let mut checkpoints: Vec<(DocId, Score)> = Vec::new(); - let mut scorer = BufferedUnionScorer::build(term_scorers, SumCombiner::default); + let mut scorer = BufferedUnionScorer::build(term_scorers, SumCombiner::default, max_doc); let mut limit = Score::MIN; loop { @@ -479,7 +483,8 @@ mod tests { for top_k in 1..4 { let checkpoints_for_each_pruning = compute_checkpoints_for_each_pruning(term_scorers.clone(), top_k); - let checkpoints_manual = compute_checkpoints_manual(term_scorers.clone(), top_k); + let checkpoints_manual = + compute_checkpoints_manual(term_scorers.clone(), top_k, max_doc as u32); assert_eq!(checkpoints_for_each_pruning.len(), checkpoints_manual.len()); for (&(left_doc, left_score), &(right_doc, right_score)) in checkpoints_for_each_pruning .iter() diff --git a/src/query/boolean_query/boolean_weight.rs b/src/query/boolean_query/boolean_weight.rs index 7b617866fe..f01c533ff0 100644 --- a/src/query/boolean_query/boolean_weight.rs +++ b/src/query/boolean_query/boolean_weight.rs @@ -42,6 +42,7 @@ where fn scorer_union( scorers: Vec>, score_combiner_fn: impl Fn() -> TScoreCombiner, + num_docs: u32, ) -> SpecializedScorer where TScoreCombiner: ScoreCombiner, @@ -68,6 +69,7 @@ where return SpecializedScorer::Other(Box::new(BufferedUnionScorer::build( scorers, score_combiner_fn, + num_docs, ))); } } @@ -75,16 +77,19 @@ where SpecializedScorer::Other(Box::new(BufferedUnionScorer::build( scorers, score_combiner_fn, + num_docs, ))) } fn into_box_scorer( scorer: SpecializedScorer, score_combiner_fn: impl Fn() -> TScoreCombiner, + num_docs: u32, ) -> Box { match scorer { SpecializedScorer::TermUnion(term_scorers) => { - let union_scorer = BufferedUnionScorer::build(term_scorers, score_combiner_fn); + let union_scorer = + BufferedUnionScorer::build(term_scorers, score_combiner_fn, num_docs); Box::new(union_scorer) } SpecializedScorer::Other(scorer) => scorer, @@ -151,6 +156,7 @@ impl BooleanWeight { boost: Score, score_combiner_fn: impl Fn() -> TComplexScoreCombiner, ) -> crate::Result { + let num_docs = reader.num_docs(); let mut per_occur_scorers = self.per_occur_scorers(reader, boost)?; // Indicate how should clauses are combined with other clauses. enum CombinationMethod { @@ -168,10 +174,15 @@ impl BooleanWeight { return Ok(SpecializedScorer::Other(Box::new(EmptyScorer))); } match self.minimum_number_should_match { - 0 => CombinationMethod::Optional(scorer_union(should_scorers, &score_combiner_fn)), + 0 => CombinationMethod::Optional(scorer_union( + should_scorers, + &score_combiner_fn, + num_docs, + )), 1 => CombinationMethod::Required(into_box_scorer( - scorer_union(should_scorers, &score_combiner_fn), + scorer_union(should_scorers, &score_combiner_fn, num_docs), &score_combiner_fn, + num_docs, )), n if num_of_should_scorers == n => { // When num_of_should_scorers equals the number of should clauses, @@ -201,21 +212,21 @@ impl BooleanWeight { }; let exclude_scorer_opt: Option> = per_occur_scorers .remove(&Occur::MustNot) - .map(|scorers| scorer_union(scorers, DoNothingCombiner::default)) + .map(|scorers| scorer_union(scorers, DoNothingCombiner::default, num_docs)) .map(|specialized_scorer: SpecializedScorer| { - into_box_scorer(specialized_scorer, DoNothingCombiner::default) + into_box_scorer(specialized_scorer, DoNothingCombiner::default, num_docs) }); let positive_scorer = match (should_opt, must_scorers) { (CombinationMethod::Ignored, Some(must_scorers)) => { - SpecializedScorer::Other(intersect_scorers(must_scorers)) + SpecializedScorer::Other(intersect_scorers(must_scorers, num_docs)) } (CombinationMethod::Optional(should_scorer), Some(must_scorers)) => { - let must_scorer = intersect_scorers(must_scorers); + let must_scorer = intersect_scorers(must_scorers, num_docs); if self.scoring_enabled { SpecializedScorer::Other(Box::new( RequiredOptionalScorer::<_, _, TScoreCombiner>::new( must_scorer, - into_box_scorer(should_scorer, &score_combiner_fn), + into_box_scorer(should_scorer, &score_combiner_fn, num_docs), ), )) } else { @@ -224,7 +235,7 @@ impl BooleanWeight { } (CombinationMethod::Required(should_scorer), Some(mut must_scorers)) => { must_scorers.push(should_scorer); - SpecializedScorer::Other(intersect_scorers(must_scorers)) + SpecializedScorer::Other(intersect_scorers(must_scorers, num_docs)) } (CombinationMethod::Ignored, None) => { return Ok(SpecializedScorer::Other(Box::new(EmptyScorer))) @@ -236,7 +247,8 @@ impl BooleanWeight { (CombinationMethod::Optional(should_scorer), None) => should_scorer, }; if let Some(exclude_scorer) = exclude_scorer_opt { - let positive_scorer_boxed = into_box_scorer(positive_scorer, &score_combiner_fn); + let positive_scorer_boxed = + into_box_scorer(positive_scorer, &score_combiner_fn, num_docs); Ok(SpecializedScorer::Other(Box::new(Exclude::new( positive_scorer_boxed, exclude_scorer, @@ -249,6 +261,7 @@ impl BooleanWeight { impl Weight for BooleanWeight { fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { + let num_docs = reader.num_docs(); if self.weights.is_empty() { Ok(Box::new(EmptyScorer)) } else if self.weights.len() == 1 { @@ -261,12 +274,12 @@ impl Weight for BooleanWeight Weight for BooleanWeight crate::Result<()> { + let num_docs = reader.num_docs(); let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?; match scorer { SpecializedScorer::TermUnion(term_scorers) => { let mut union_scorer = - BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn); + BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn, num_docs); for_each_scorer(&mut union_scorer, callback); } SpecializedScorer::Other(mut scorer) => { @@ -315,13 +329,14 @@ impl Weight for BooleanWeight crate::Result<()> { + let num_docs = reader.num_docs(); let scorer = self.complex_scorer(reader, 1.0, || DoNothingCombiner)?; let mut buffer = [0u32; COLLECT_BLOCK_BUFFER_LEN]; match scorer { SpecializedScorer::TermUnion(term_scorers) => { let mut union_scorer = - BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn); + BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn, num_docs); for_each_docset_buffered(&mut union_scorer, &mut buffer, callback); } SpecializedScorer::Other(mut scorer) => { diff --git a/src/query/boost_query.rs b/src/query/boost_query.rs index 4d2352d4d6..da442b323c 100644 --- a/src/query/boost_query.rs +++ b/src/query/boost_query.rs @@ -104,6 +104,9 @@ impl DocSet for BoostScorer { fn seek(&mut self, target: DocId) -> DocId { self.underlying.seek(target) } + fn seek_exact(&mut self, target: DocId) -> bool { + self.underlying.seek_exact(target) + } fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize { self.underlying.fill_buffer(buffer) @@ -117,6 +120,10 @@ impl DocSet for BoostScorer { self.underlying.size_hint() } + fn cost(&self) -> u64 { + self.underlying.cost() + } + fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 { self.underlying.count(alive_bitset) } diff --git a/src/query/const_score_query.rs b/src/query/const_score_query.rs index 8f27b82857..570c7fecae 100644 --- a/src/query/const_score_query.rs +++ b/src/query/const_score_query.rs @@ -130,6 +130,10 @@ impl DocSet for ConstScorer { fn size_hint(&self) -> u32 { self.docset.size_hint() } + + fn cost(&self) -> u64 { + self.docset.cost() + } } impl Scorer for ConstScorer { diff --git a/src/query/disjunction.rs b/src/query/disjunction.rs index 81723af9a7..f34394e0fd 100644 --- a/src/query/disjunction.rs +++ b/src/query/disjunction.rs @@ -62,6 +62,16 @@ impl DocSet for ScorerWrapper { self.current_doc = doc_id; doc_id } + fn seek(&mut self, target: DocId) -> DocId { + let doc_id = self.scorer.seek(target); + self.current_doc = doc_id; + doc_id + } + fn seek_exact(&mut self, target: DocId) -> bool { + let found = self.scorer.seek_exact(target); + self.current_doc = self.scorer.doc(); + found + } fn doc(&self) -> DocId { self.current_doc @@ -70,6 +80,10 @@ impl DocSet for ScorerWrapper { fn size_hint(&self) -> u32 { self.scorer.size_hint() } + + fn cost(&self) -> u64 { + self.scorer.cost() + } } impl Disjunction { @@ -146,6 +160,14 @@ impl DocSet .max() .unwrap_or(0u32) } + + fn cost(&self) -> u64 { + self.chains + .iter() + .map(|docset| docset.cost()) + .max() + .unwrap_or(0u64) + } } impl Scorer diff --git a/src/query/intersection.rs b/src/query/intersection.rs index 61c8ca8f36..01a435b43b 100644 --- a/src/query/intersection.rs +++ b/src/query/intersection.rs @@ -1,3 +1,4 @@ +use super::size_hint::estimate_intersection; use crate::docset::{DocSet, TERMINATED}; use crate::query::term_query::TermScorer; use crate::query::{EmptyScorer, Scorer}; @@ -11,14 +12,14 @@ use crate::{DocId, Score}; /// For better performance, the function uses a /// specialized implementation if the two /// shortest scorers are `TermScorer`s. -pub fn intersect_scorers(mut scorers: Vec>) -> Box { +pub fn intersect_scorers(mut scorers: Vec>, num_docs: u32) -> Box { if scorers.is_empty() { return Box::new(EmptyScorer); } if scorers.len() == 1 { return scorers.pop().unwrap(); } - scorers.sort_by_key(|scorer| scorer.size_hint()); + scorers.sort_by_key(|scorer| scorer.cost()); let doc = go_to_first_doc(&mut scorers[..]); if doc == TERMINATED { return Box::new(EmptyScorer); @@ -34,12 +35,14 @@ pub fn intersect_scorers(mut scorers: Vec>) -> Box { left: *(left.downcast::().map_err(|_| ()).unwrap()), right: *(right.downcast::().map_err(|_| ()).unwrap()), others: scorers, + num_docs, }); } Box::new(Intersection { left, right, others: scorers, + num_docs, }) } @@ -48,6 +51,7 @@ pub struct Intersection> left: TDocSet, right: TDocSet, others: Vec, + num_docs: u32, } fn go_to_first_doc(docsets: &mut [TDocSet]) -> DocId { @@ -66,7 +70,7 @@ fn go_to_first_doc(docsets: &mut [TDocSet]) -> DocId { } impl Intersection { - pub(crate) fn new(mut docsets: Vec) -> Intersection { + pub(crate) fn new(mut docsets: Vec, num_docs: u32) -> Intersection { let num_docsets = docsets.len(); assert!(num_docsets >= 2); docsets.sort_by_key(|docset| docset.size_hint()); @@ -77,6 +81,7 @@ impl Intersection { left, right, others: docsets, + num_docs, } } } @@ -95,32 +100,39 @@ impl DocSet for Intersection DocId { let (left, right) = (&mut self.left, &mut self.right); let mut candidate = left.advance(); + if candidate == TERMINATED { + return TERMINATED; + } - 'outer: loop { + loop { // In the first part we look for a document in the intersection // of the two rarest `DocSet` in the intersection. loop { - let right_doc = right.seek(candidate); - candidate = left.seek(right_doc); - if candidate == right_doc { + if right.seek_exact(candidate) { break; } + // `left.advance().max(right.doc())` yielded a regression in the search game + // benchmark It may make sense in certain scenarios though. + candidate = left.advance(); + if candidate == TERMINATED { + return TERMINATED; + } } debug_assert_eq!(left.doc(), right.doc()); - // test the remaining scorers; - for docset in self.others.iter_mut() { - let seek_doc = docset.seek(candidate); - if seek_doc > candidate { - candidate = left.seek(seek_doc); - continue 'outer; - } + // test the remaining scorers + if self + .others + .iter_mut() + .all(|docset| docset.seek_exact(candidate)) + { + debug_assert_eq!(candidate, self.left.doc()); + debug_assert_eq!(candidate, self.right.doc()); + debug_assert!(self.others.iter().all(|docset| docset.doc() == candidate)); + return candidate; } - debug_assert_eq!(candidate, self.left.doc()); - debug_assert_eq!(candidate, self.right.doc()); - debug_assert!(self.others.iter().all(|docset| docset.doc() == candidate)); - return candidate; + candidate = left.advance(); } } @@ -136,12 +148,37 @@ impl DocSet for Intersection bool { + self.left.seek_exact(target) + && self.right.seek_exact(target) + && self + .others + .iter_mut() + .all(|docset| docset.seek_exact(target)) + } + fn doc(&self) -> DocId { self.left.doc() } fn size_hint(&self) -> u32 { - self.left.size_hint() + estimate_intersection( + [self.left.size_hint(), self.right.size_hint()] + .into_iter() + .chain(self.others.iter().map(DocSet::size_hint)), + self.num_docs, + ) + } + + fn cost(&self) -> u64 { + // What's the best way to compute the cost of an intersection? + // For now we take the cost of the docset driver, which is the first docset. + // If there are docsets that are bad at skipping, they should also influence the cost. + self.left.cost() } } @@ -159,6 +196,8 @@ where #[cfg(test)] mod tests { + use proptest::prelude::*; + use super::Intersection; use crate::docset::{DocSet, TERMINATED}; use crate::postings::tests::test_skip_against_unoptimized; @@ -169,7 +208,7 @@ mod tests { { let left = VecDocSet::from(vec![1, 3, 9]); let right = VecDocSet::from(vec![3, 4, 9, 18]); - let mut intersection = Intersection::new(vec![left, right]); + let mut intersection = Intersection::new(vec![left, right], 10); assert_eq!(intersection.doc(), 3); assert_eq!(intersection.advance(), 9); assert_eq!(intersection.doc(), 9); @@ -179,7 +218,7 @@ mod tests { let a = VecDocSet::from(vec![1, 3, 9]); let b = VecDocSet::from(vec![3, 4, 9, 18]); let c = VecDocSet::from(vec![1, 5, 9, 111]); - let mut intersection = Intersection::new(vec![a, b, c]); + let mut intersection = Intersection::new(vec![a, b, c], 10); assert_eq!(intersection.doc(), 9); assert_eq!(intersection.advance(), TERMINATED); } @@ -189,7 +228,7 @@ mod tests { fn test_intersection_zero() { let left = VecDocSet::from(vec![0]); let right = VecDocSet::from(vec![0]); - let mut intersection = Intersection::new(vec![left, right]); + let mut intersection = Intersection::new(vec![left, right], 10); assert_eq!(intersection.doc(), 0); assert_eq!(intersection.advance(), TERMINATED); } @@ -198,7 +237,7 @@ mod tests { fn test_intersection_skip() { let left = VecDocSet::from(vec![0, 1, 2, 4]); let right = VecDocSet::from(vec![2, 5]); - let mut intersection = Intersection::new(vec![left, right]); + let mut intersection = Intersection::new(vec![left, right], 10); assert_eq!(intersection.seek(2), 2); assert_eq!(intersection.doc(), 2); } @@ -209,7 +248,7 @@ mod tests { || { let left = VecDocSet::from(vec![4]); let right = VecDocSet::from(vec![2, 5]); - Box::new(Intersection::new(vec![left, right])) + Box::new(Intersection::new(vec![left, right], 10)) }, vec![0, 2, 4, 5, 6], ); @@ -219,19 +258,22 @@ mod tests { let mut right = VecDocSet::from(vec![2, 5, 10]); left.advance(); right.advance(); - Box::new(Intersection::new(vec![left, right])) + Box::new(Intersection::new(vec![left, right], 10)) }, vec![0, 1, 2, 3, 4, 5, 6, 7, 10, 11], ); test_skip_against_unoptimized( || { - Box::new(Intersection::new(vec![ - VecDocSet::from(vec![1, 4, 5, 6]), - VecDocSet::from(vec![1, 2, 5, 6]), - VecDocSet::from(vec![1, 4, 5, 6]), - VecDocSet::from(vec![1, 5, 6]), - VecDocSet::from(vec![2, 4, 5, 7, 8]), - ])) + Box::new(Intersection::new( + vec![ + VecDocSet::from(vec![1, 4, 5, 6]), + VecDocSet::from(vec![1, 2, 5, 6]), + VecDocSet::from(vec![1, 4, 5, 6]), + VecDocSet::from(vec![1, 5, 6]), + VecDocSet::from(vec![2, 4, 5, 7, 8]), + ], + 10, + )) }, vec![0, 1, 2, 3, 4, 5, 6, 7, 10, 11], ); @@ -242,7 +284,41 @@ mod tests { let a = VecDocSet::from(vec![1, 3]); let b = VecDocSet::from(vec![1, 4]); let c = VecDocSet::from(vec![3, 9]); - let intersection = Intersection::new(vec![a, b, c]); + let intersection = Intersection::new(vec![a, b, c], 10); assert_eq!(intersection.doc(), TERMINATED); } + + // Strategy to generate sorted and deduplicated vectors of u32 document IDs + fn sorted_deduped_vec(max_val: u32, max_size: usize) -> impl Strategy> { + prop::collection::vec(0..max_val, 0..max_size).prop_map(|mut vec| { + vec.sort(); + vec.dedup(); + vec + }) + } + + proptest! { + #[test] + fn prop_test_intersection_consistency( + a in sorted_deduped_vec(100, 10), + b in sorted_deduped_vec(100, 10), + num_docs in 100u32..500u32 + ) { + let left = VecDocSet::from(a.clone()); + let right = VecDocSet::from(b.clone()); + let mut intersection = Intersection::new(vec![left, right], num_docs); + + let expected: Vec = a.iter() + .cloned() + .filter(|doc| b.contains(doc)) + .collect(); + + for expected_doc in expected { + assert_eq!(intersection.doc(), expected_doc); + intersection.advance(); + } + assert_eq!(intersection.doc(), TERMINATED); + } + + } } diff --git a/src/query/mod.rs b/src/query/mod.rs index 23e64f1894..c343c754b2 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -23,6 +23,7 @@ mod regex_query; mod reqopt_scorer; mod scorer; mod set_query; +mod size_hint; mod term_query; mod union; mod weight; diff --git a/src/query/phrase_prefix_query/phrase_prefix_scorer.rs b/src/query/phrase_prefix_query/phrase_prefix_scorer.rs index 09cf6c5bdf..aa6f26d265 100644 --- a/src/query/phrase_prefix_query/phrase_prefix_scorer.rs +++ b/src/query/phrase_prefix_query/phrase_prefix_scorer.rs @@ -193,6 +193,14 @@ impl DocSet for PhrasePrefixScorer { self.advance() } + fn seek_exact(&mut self, target: DocId) -> bool { + if self.phrase_scorer.seek_exact(target) { + self.matches_prefix() + } else { + false + } + } + fn doc(&self) -> DocId { self.phrase_scorer.doc() } @@ -200,6 +208,10 @@ impl DocSet for PhrasePrefixScorer { fn size_hint(&self) -> u32 { self.phrase_scorer.size_hint() } + + fn cost(&self) -> u64 { + self.phrase_scorer.cost() + } } impl Scorer for PhrasePrefixScorer { diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index 5c67f4e27b..a41dadc823 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -368,6 +368,7 @@ impl PhraseScorer { slop: u32, offset: usize, ) -> PhraseScorer { + let num_docs = fieldnorm_reader.num_docs(); let max_offset = term_postings_with_offset .iter() .map(|&(offset, _)| offset) @@ -381,8 +382,9 @@ impl PhraseScorer { PostingsWithOffset::new(postings, (max_offset - offset) as u32) }) .collect::>(); + let intersection_docset = Intersection::new(postings_with_offsets, num_docs); let mut scorer = PhraseScorer { - intersection_docset: Intersection::new(postings_with_offsets), + intersection_docset, num_terms: num_docsets, left_positions: Vec::with_capacity(100), right_positions: Vec::with_capacity(100), @@ -528,12 +530,35 @@ impl DocSet for PhraseScorer { self.advance() } + fn seek_exact(&mut self, target: DocId) -> bool { + debug_assert!(target >= self.doc()); + if self.intersection_docset.seek_exact(target) && self.phrase_match() { + return true; + } + false + } + fn doc(&self) -> DocId { self.intersection_docset.doc() } fn size_hint(&self) -> u32 { - self.intersection_docset.size_hint() + // We adjust the intersection estimate, since actual phrase hits are much lower than where + // the all appear. + // The estimate should depend on average field length, e.g. if the field is really short + // a phrase hit is more likely + self.intersection_docset.size_hint() / (10 * self.num_terms as u32) + } + + /// Returns a best-effort hint of the + /// cost to drive the docset. + fn cost(&self) -> u64 { + // While determing a potential hit is cheap for phrases, evaluating an actual hit is + // expensive since it requires to load positions for a doc and check if they are next to + // each other. + // So the cost estimation would be the number of times we need to check if a doc is a hit * + // 10 * self.num_terms. + self.intersection_docset.size_hint() as u64 * 10 * self.num_terms as u64 } } diff --git a/src/query/range_query/fast_field_range_doc_set.rs b/src/query/range_query/fast_field_range_doc_set.rs index 779269069b..b9bc25d6f8 100644 --- a/src/query/range_query/fast_field_range_doc_set.rs +++ b/src/query/range_query/fast_field_range_doc_set.rs @@ -81,6 +81,9 @@ impl RangeDocSet { /// Returns true if more data could be fetched fn fetch_block(&mut self) { + if self.next_fetch_start >= self.column.num_docs() { + return; + } const MAX_HORIZON: u32 = 100_000; while self.loaded_docs.is_empty() { let finished_to_end = self.fetch_horizon(self.fetch_horizon); @@ -105,10 +108,10 @@ impl RangeDocSet { fn fetch_horizon(&mut self, horizon: u32) -> bool { let mut finished_to_end = false; - let limit = self.column.num_docs(); - let mut end = self.next_fetch_start + horizon; - if end >= limit { - end = limit; + let num_docs = self.column.num_docs(); + let mut fetch_end = self.next_fetch_start + horizon; + if fetch_end >= num_docs { + fetch_end = num_docs; finished_to_end = true; } @@ -116,7 +119,7 @@ impl RangeDocSet { let doc_buffer: &mut Vec = self.loaded_docs.get_cleared_data(); self.column.get_docids_for_value_range( self.value_range.clone(), - self.next_fetch_start..end, + self.next_fetch_start..fetch_end, doc_buffer, ); if let Some(last_doc) = last_doc { @@ -124,7 +127,7 @@ impl RangeDocSet { self.loaded_docs.next(); } } - self.next_fetch_start = end; + self.next_fetch_start = fetch_end; finished_to_end } @@ -136,9 +139,6 @@ impl DocSet for RangeDocSe if let Some(docid) = self.loaded_docs.next() { return docid; } - if self.next_fetch_start >= self.column.num_docs() { - return TERMINATED; - } self.fetch_block(); self.loaded_docs.current().unwrap_or(TERMINATED) } @@ -174,7 +174,18 @@ impl DocSet for RangeDocSe } fn size_hint(&self) -> u32 { - self.column.num_docs() + // TODO: Implement a better size hint + self.column.num_docs() / 10 + } + + /// Returns a best-effort hint of the + /// cost to drive the docset. + fn cost(&self) -> u64 { + // Advancing the docset is pretty expensive since it scans the whole column, there is no + // index currently (will change with an kd-tree) + // Since we use SIMD to scan the fast field range query we lower the cost a little bit. + // Ideally this would take the fast field codec into account + (self.column.num_docs() as f64 * 0.8) as u64 } } diff --git a/src/query/range_query/range_query_fastfield.rs b/src/query/range_query/range_query_fastfield.rs index ef34915e89..92bde8d895 100644 --- a/src/query/range_query/range_query_fastfield.rs +++ b/src/query/range_query/range_query_fastfield.rs @@ -1579,449 +1579,3 @@ pub(crate) mod ip_range_tests { Ok(()) } } - -#[cfg(all(test, feature = "unstable"))] -mod bench { - - use rand::rngs::StdRng; - use rand::{Rng, SeedableRng}; - use test::Bencher; - - use super::tests::*; - use super::*; - use crate::collector::Count; - use crate::query::QueryParser; - use crate::Index; - - fn get_index_0_to_100() -> Index { - let mut rng = StdRng::from_seed([1u8; 32]); - let num_vals = 100_000; - let docs: Vec<_> = (0..num_vals) - .map(|_i| { - let id_name = if rng.gen_bool(0.01) { - "veryfew".to_string() // 1% - } else if rng.gen_bool(0.1) { - "few".to_string() // 9% - } else { - "many".to_string() // 90% - }; - Doc { - id_name, - id: rng.gen_range(0..100), - } - }) - .collect(); - - create_index_from_docs(&docs, false) - } - - fn get_90_percent() -> RangeInclusive { - 0..=90 - } - - fn get_10_percent() -> RangeInclusive { - 0..=10 - } - - fn get_1_percent() -> RangeInclusive { - 10..=10 - } - - fn execute_query( - field: &str, - id_range: RangeInclusive, - suffix: &str, - index: &Index, - ) -> usize { - let gen_query_inclusive = |from: &u64, to: &u64| { - format!( - "{}:[{} TO {}] {}", - field, - &from.to_string(), - &to.to_string(), - suffix - ) - }; - - let query = gen_query_inclusive(id_range.start(), id_range.end()); - let query_from_text = |text: &str| { - QueryParser::for_index(index, vec![]) - .parse_query(text) - .unwrap() - }; - let query = query_from_text(&query); - let reader = index.reader().unwrap(); - let searcher = reader.searcher(); - searcher.search(&query, &(Count)).unwrap() - } - - #[bench] - fn bench_id_range_hit_90_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("id", get_90_percent(), "", &index)); - } - - #[bench] - fn bench_id_range_hit_10_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("id", get_10_percent(), "", &index)); - } - - #[bench] - fn bench_id_range_hit_1_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("id", get_1_percent(), "", &index)); - } - - #[bench] - fn bench_id_range_hit_10_percent_intersect_with_10_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("id", get_10_percent(), "AND id_name:few", &index)); - } - - #[bench] - fn bench_id_range_hit_1_percent_intersect_with_10_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("id", get_1_percent(), "AND id_name:few", &index)); - } - - #[bench] - fn bench_id_range_hit_1_percent_intersect_with_90_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("id", get_1_percent(), "AND id_name:many", &index)); - } - - #[bench] - fn bench_id_range_hit_1_percent_intersect_with_1_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("id", get_1_percent(), "AND id_name:veryfew", &index)); - } - - #[bench] - fn bench_id_range_hit_10_percent_intersect_with_90_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("id", get_10_percent(), "AND id_name:many", &index)); - } - - #[bench] - fn bench_id_range_hit_90_percent_intersect_with_90_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("id", get_90_percent(), "AND id_name:many", &index)); - } - - #[bench] - fn bench_id_range_hit_90_percent_intersect_with_10_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("id", get_90_percent(), "AND id_name:few", &index)); - } - - #[bench] - fn bench_id_range_hit_90_percent_intersect_with_1_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("id", get_90_percent(), "AND id_name:veryfew", &index)); - } - - #[bench] - fn bench_id_range_hit_90_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("ids", get_90_percent(), "", &index)); - } - - #[bench] - fn bench_id_range_hit_10_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("ids", get_10_percent(), "", &index)); - } - - #[bench] - fn bench_id_range_hit_1_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("ids", get_1_percent(), "", &index)); - } - - #[bench] - fn bench_id_range_hit_10_percent_intersect_with_10_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("ids", get_10_percent(), "AND id_name:few", &index)); - } - - #[bench] - fn bench_id_range_hit_1_percent_intersect_with_10_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("ids", get_1_percent(), "AND id_name:few", &index)); - } - - #[bench] - fn bench_id_range_hit_1_percent_intersect_with_90_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("ids", get_1_percent(), "AND id_name:many", &index)); - } - - #[bench] - fn bench_id_range_hit_1_percent_intersect_with_1_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("ids", get_1_percent(), "AND id_name:veryfew", &index)); - } - - #[bench] - fn bench_id_range_hit_10_percent_intersect_with_90_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("ids", get_10_percent(), "AND id_name:many", &index)); - } - - #[bench] - fn bench_id_range_hit_90_percent_intersect_with_90_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("ids", get_90_percent(), "AND id_name:many", &index)); - } - - #[bench] - fn bench_id_range_hit_90_percent_intersect_with_10_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("ids", get_90_percent(), "AND id_name:few", &index)); - } - - #[bench] - fn bench_id_range_hit_90_percent_intersect_with_1_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("ids", get_90_percent(), "AND id_name:veryfew", &index)); - } -} - -#[cfg(all(test, feature = "unstable"))] -mod bench_ip { - - use rand::rngs::StdRng; - use rand::{Rng, SeedableRng}; - use test::Bencher; - - use super::ip_range_tests::*; - use super::*; - use crate::collector::Count; - use crate::query::QueryParser; - use crate::Index; - - fn get_index_0_to_100() -> Index { - let mut rng = StdRng::from_seed([1u8; 32]); - let num_vals = 100_000; - let docs: Vec<_> = (0..num_vals) - .map(|_i| { - let id = if rng.gen_bool(0.01) { - "veryfew".to_string() // 1% - } else if rng.gen_bool(0.1) { - "few".to_string() // 9% - } else { - "many".to_string() // 90% - }; - Doc { - id, - // Multiply by 1000, so that we create many buckets in the compact space - // The benches depend on this range to select n-percent of elements with the - // methods below. - ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000), - } - }) - .collect(); - - create_index_from_ip_docs(&docs) - } - - fn get_90_percent() -> RangeInclusive { - let start = Ipv6Addr::from_u128(0); - let end = Ipv6Addr::from_u128(90 * 1000); - start..=end - } - - fn get_10_percent() -> RangeInclusive { - let start = Ipv6Addr::from_u128(0); - let end = Ipv6Addr::from_u128(10 * 1000); - start..=end - } - - fn get_1_percent() -> RangeInclusive { - let start = Ipv6Addr::from_u128(10 * 1000); - let end = Ipv6Addr::from_u128(10 * 1000); - start..=end - } - - fn execute_query( - field: &str, - ip_range: RangeInclusive, - suffix: &str, - index: &Index, - ) -> usize { - let gen_query_inclusive = |from: &Ipv6Addr, to: &Ipv6Addr| { - format!( - "{}:[{} TO {}] {}", - field, - &from.to_string(), - &to.to_string(), - suffix - ) - }; - - let query = gen_query_inclusive(ip_range.start(), ip_range.end()); - let query_from_text = |text: &str| { - QueryParser::for_index(index, vec![]) - .parse_query(text) - .unwrap() - }; - let query = query_from_text(&query); - let reader = index.reader().unwrap(); - let searcher = reader.searcher(); - searcher.search(&query, &(Count)).unwrap() - } - - #[bench] - fn bench_ip_range_hit_90_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ip", get_90_percent(), "", &index)); - } - - #[bench] - fn bench_ip_range_hit_10_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ip", get_10_percent(), "", &index)); - } - - #[bench] - fn bench_ip_range_hit_1_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ip", get_1_percent(), "", &index)); - } - - #[bench] - fn bench_ip_range_hit_10_percent_intersect_with_10_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ip", get_10_percent(), "AND id:few", &index)); - } - - #[bench] - fn bench_ip_range_hit_1_percent_intersect_with_10_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ip", get_1_percent(), "AND id:few", &index)); - } - - #[bench] - fn bench_ip_range_hit_1_percent_intersect_with_90_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ip", get_1_percent(), "AND id:many", &index)); - } - - #[bench] - fn bench_ip_range_hit_1_percent_intersect_with_1_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ip", get_1_percent(), "AND id:veryfew", &index)); - } - - #[bench] - fn bench_ip_range_hit_10_percent_intersect_with_90_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ip", get_10_percent(), "AND id:many", &index)); - } - - #[bench] - fn bench_ip_range_hit_90_percent_intersect_with_90_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ip", get_90_percent(), "AND id:many", &index)); - } - - #[bench] - fn bench_ip_range_hit_90_percent_intersect_with_10_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ip", get_90_percent(), "AND id:few", &index)); - } - - #[bench] - fn bench_ip_range_hit_90_percent_intersect_with_1_percent(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ip", get_90_percent(), "AND id:veryfew", &index)); - } - - #[bench] - fn bench_ip_range_hit_90_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ips", get_90_percent(), "", &index)); - } - - #[bench] - fn bench_ip_range_hit_10_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ips", get_10_percent(), "", &index)); - } - - #[bench] - fn bench_ip_range_hit_1_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ips", get_1_percent(), "", &index)); - } - - #[bench] - fn bench_ip_range_hit_10_percent_intersect_with_10_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ips", get_10_percent(), "AND id:few", &index)); - } - - #[bench] - fn bench_ip_range_hit_1_percent_intersect_with_10_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ips", get_1_percent(), "AND id:few", &index)); - } - - #[bench] - fn bench_ip_range_hit_1_percent_intersect_with_90_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - bench.iter(|| execute_query("ips", get_1_percent(), "AND id:many", &index)); - } - - #[bench] - fn bench_ip_range_hit_1_percent_intersect_with_1_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ips", get_1_percent(), "AND id:veryfew", &index)); - } - - #[bench] - fn bench_ip_range_hit_10_percent_intersect_with_90_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ips", get_10_percent(), "AND id:many", &index)); - } - - #[bench] - fn bench_ip_range_hit_90_percent_intersect_with_90_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ips", get_90_percent(), "AND id:many", &index)); - } - - #[bench] - fn bench_ip_range_hit_90_percent_intersect_with_10_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ips", get_90_percent(), "AND id:few", &index)); - } - - #[bench] - fn bench_ip_range_hit_90_percent_intersect_with_1_percent_multi(bench: &mut Bencher) { - let index = get_index_0_to_100(); - - bench.iter(|| execute_query("ips", get_90_percent(), "AND id:veryfew", &index)); - } -} diff --git a/src/query/reqopt_scorer.rs b/src/query/reqopt_scorer.rs index 3a4ae61b9c..1217eb65af 100644 --- a/src/query/reqopt_scorer.rs +++ b/src/query/reqopt_scorer.rs @@ -56,6 +56,11 @@ where self.req_scorer.seek(target) } + fn seek_exact(&mut self, target: DocId) -> bool { + self.score_cache = None; + self.req_scorer.seek_exact(target) + } + fn doc(&self) -> DocId { self.req_scorer.doc() } @@ -63,6 +68,10 @@ where fn size_hint(&self) -> u32 { self.req_scorer.size_hint() } + + fn cost(&self) -> u64 { + self.req_scorer.cost() + } } impl Scorer diff --git a/src/query/size_hint.rs b/src/query/size_hint.rs new file mode 100644 index 0000000000..9629bf69c4 --- /dev/null +++ b/src/query/size_hint.rs @@ -0,0 +1,138 @@ +/// Computes the estimated number of documents in the intersection of multiple docsets +/// given their sizes. +/// +/// # Arguments +/// * `docset_sizes` - An iterator over the sizes of the docsets (number of documents in each set). +/// * `max_docs` - The maximum number of docs that can hit, usually number of documents in the +/// segment. +/// +/// # Returns +/// The estimated number of documents in the intersection. +pub fn estimate_intersection(mut docset_sizes: I, max_docs: u32) -> u32 +where I: Iterator { + // Terms tend to be not really randomly distributed. + // This factor is used to adjust the estimate. + let mut co_loc_factor: f64 = 1.3; + + let mut intersection_estimate = match docset_sizes.next() { + Some(first_size) => first_size as f64, + None => return 0, // No docsets provided, so return 0. + }; + + let mut smallest_docset_size = intersection_estimate; + // Assuming random distribution of terms, the probability of a document being in the + // intersection + for size in docset_sizes { + // Diminish the co-location factor for each additional set, or we will overestimate. + co_loc_factor = (co_loc_factor - 0.1).max(1.0); + intersection_estimate *= (size as f64 / max_docs as f64) * co_loc_factor; + smallest_docset_size = smallest_docset_size.min(size as f64); + } + + intersection_estimate.round().min(smallest_docset_size) as u32 +} + +/// Computes the estimated number of documents in the union of multiple docsets +/// given their sizes. +/// +/// # Arguments +/// * `docset_sizes` - An iterator over the sizes of the docsets (number of documents in each set). +/// * `max_docs` - The maximum number of docs that can hit, usually number of documents in the +/// segment. +/// +/// # Returns +/// The estimated number of documents in the union. +pub fn estimate_union(docset_sizes: I, max_docs: u32) -> u32 +where I: Iterator { + // Terms tend to be not really randomly distributed. + // This factor is used to adjust the estimate. + // Unlike intersection, the co-location reduces the estimate. + let co_loc_factor = 0.8; + + // The approach for union is to compute the probability of a document not being in any of the + // sets + let mut not_in_any_set_prob = 1.0; + + // Assuming random distribution of terms, the probability of a document being in the + // union is the complement of the probability of it not being in any of the sets. + for size in docset_sizes { + let prob_in_set = (size as f64 / max_docs as f64) * co_loc_factor; + not_in_any_set_prob *= 1.0 - prob_in_set; + } + + let union_estimate = (max_docs as f64 * (1.0 - not_in_any_set_prob)).round(); + + union_estimate.min(u32::MAX as f64) as u32 +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_estimate_intersection_small1() { + let docset_sizes = &[500, 1000]; + let n = 10_000; + let result = estimate_intersection(docset_sizes.iter().copied(), n); + assert_eq!(result, 60); + } + + #[test] + fn test_estimate_intersection_small2() { + let docset_sizes = &[500, 1000, 1500]; + let n = 10_000; + let result = estimate_intersection(docset_sizes.iter().copied(), n); + assert_eq!(result, 10); + } + + #[test] + fn test_estimate_intersection_large_values() { + let docset_sizes = &[100_000, 50_000, 30_000]; + let n = 1_000_000; + let result = estimate_intersection(docset_sizes.iter().copied(), n); + assert_eq!(result, 198); + } + + #[test] + fn test_estimate_union_small() { + let docset_sizes = &[500, 1000, 1500]; + let n = 10000; + let result = estimate_union(docset_sizes.iter().copied(), n); + assert_eq!(result, 2228); + } + + #[test] + fn test_estimate_union_large_values() { + let docset_sizes = &[100000, 50000, 30000]; + let n = 1000000; + let result = estimate_union(docset_sizes.iter().copied(), n); + assert_eq!(result, 137997); + } + + #[test] + fn test_estimate_intersection_large() { + let docset_sizes: Vec<_> = (0..10).map(|_| 4_000_000).collect(); + let n = 5_000_000; + let result = estimate_intersection(docset_sizes.iter().copied(), n); + // Check that it doesn't overflow and returns a reasonable result + assert_eq!(result, 708_670); + } + + #[test] + fn test_estimate_intersection_overflow_safety() { + let docset_sizes: Vec<_> = (0..100).map(|_| 4_000_000).collect(); + let n = 5_000_000; + let result = estimate_intersection(docset_sizes.iter().copied(), n); + // Check that it doesn't overflow and returns a reasonable result + assert_eq!(result, 0); + } + + #[test] + fn test_estimate_union_overflow_safety() { + let docset_sizes: Vec<_> = (0..100).map(|_| 1_000_000).collect(); + let n = 20_000_000; + let result = estimate_union(docset_sizes.iter().copied(), n); + // Check that it doesn't overflow and returns a reasonable result + assert_eq!(result, 19_662_594); + } +} diff --git a/src/query/union/buffered_union.rs b/src/query/union/buffered_union.rs index 5fc946ee11..02e26f0991 100644 --- a/src/query/union/buffered_union.rs +++ b/src/query/union/buffered_union.rs @@ -2,6 +2,7 @@ use common::TinySet; use crate::docset::{DocSet, TERMINATED}; use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner}; +use crate::query::size_hint::estimate_union; use crate::query::Scorer; use crate::{DocId, Score}; @@ -34,6 +35,7 @@ pub struct BufferedUnionScorer { offset: DocId, doc: DocId, score: Score, + num_docs: u32, } fn refill( @@ -65,6 +67,7 @@ impl BufferedUnionScorer, score_combiner_fn: impl FnOnce() -> TScoreCombiner, + num_docs: u32, ) -> BufferedUnionScorer { let non_empty_docsets: Vec = docsets .into_iter() @@ -78,6 +81,7 @@ impl BufferedUnionScorer DocId { self.doc } fn size_hint(&self) -> u32 { - self.docsets - .iter() - .map(|docset| docset.size_hint()) - .max() - .unwrap_or(0u32) + estimate_union(self.docsets.iter().map(DocSet::size_hint), self.num_docs) } + fn cost(&self) -> u64 { + self.docsets.iter().map(DocSet::cost).sum() + } + + // TODO Also implement `count` with deletes efficiently. fn count_including_deleted(&mut self) -> u32 { if self.doc == TERMINATED { return 0; diff --git a/src/query/union/mod.rs b/src/query/union/mod.rs index 84153e272f..539c6c3878 100644 --- a/src/query/union/mod.rs +++ b/src/query/union/mod.rs @@ -27,11 +27,17 @@ mod tests { docs_list.iter().cloned().map(VecDocSet::from) } fn union_from_docs_list(docs_list: &[Vec]) -> Box { + let max_doc = docs_list + .iter() + .flat_map(|docs| docs.iter().copied()) + .max() + .unwrap_or(0); Box::new(BufferedUnionScorer::build( vec_doc_set_from_docs_list(docs_list) .map(|docset| ConstScorer::new(docset, 1.0)) .collect::>>(), DoNothingCombiner::default, + max_doc, )) } @@ -273,6 +279,7 @@ mod bench { .map(|docset| ConstScorer::new(docset, 1.0)) .collect::>(), DoNothingCombiner::default, + 100_000, ); while v.doc() != TERMINATED { v.advance(); @@ -294,6 +301,7 @@ mod bench { .map(|docset| ConstScorer::new(docset, 1.0)) .collect::>(), DoNothingCombiner::default, + 100_000, ); while v.doc() != TERMINATED { v.advance(); diff --git a/src/query/union/simple_union.rs b/src/query/union/simple_union.rs index 041d4c90e1..b153a7f22a 100644 --- a/src/query/union/simple_union.rs +++ b/src/query/union/simple_union.rs @@ -92,6 +92,7 @@ impl DocSet for SimpleUnion { } fn size_hint(&self) -> u32 { + // TODO: use estimate_union self.docsets .iter() .map(|docset| docset.size_hint()) @@ -99,6 +100,10 @@ impl DocSet for SimpleUnion { .unwrap_or(0u32) } + fn cost(&self) -> u64 { + self.docsets.iter().map(|docset| docset.cost()).sum() + } + fn count_including_deleted(&mut self) -> u32 { if self.doc == TERMINATED { return 0u32;