Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prefix phrase query optim #2425

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 30 additions & 10 deletions src/query/phrase_prefix_query/phrase_prefix_scorer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
use crate::fieldnorm::FieldNormReader;
use crate::postings::Postings;
use crate::query::bm25::Bm25Weight;
use crate::query::phrase_query::{intersection_count, PhraseScorer};
use crate::query::phrase_query::{intersection_count, intersection_exists, PhraseScorer};
use crate::query::Scorer;
use crate::{DocId, Score};

Expand Down Expand Up @@ -92,22 +92,25 @@
}
}

/// Scorer for a phrase-prefix query: it wraps a phrase scorer and a list of
/// suffix postings whose positions are checked against the phrase matches.
///
/// When `SCORING_ENABLED` is `false`, `matches_prefix` stops at the first
/// suffix whose positions intersect the phrase positions instead of counting
/// every match, and `score()` returns a constant `1.0`.
// NOTE(review): diff view — the first header below is the pre-change line,
// the second is the new one carrying the `SCORING_ENABLED` const generic.
pub struct PhrasePrefixScorer<TPostings: Postings> {
pub struct PhrasePrefixScorer<TPostings: Postings, const SCORING_ENABLED: bool> {
// Underlying scorer: drives `advance()` and supplies the matching positions
// through `get_intersection()`.
phrase_scorer: PhraseKind<TPostings>,
// Postings that `matches_prefix` seeks to the current doc and intersects with
// the phrase positions (presumably one per term expanding the prefix — TODO confirm).
suffixes: Vec<TPostings>,
// Offset handed to `positions_with_offset` when reading suffix positions;
// computed in `new` as `max_offset - suffix_pos`.
suffix_offset: u32,
// Match count stored by `matches_prefix` for the current doc. Starts at 0 and
// is only written when `SCORING_ENABLED` is `true`.
phrase_count: u32,
// Scratch buffer filled by `positions_with_offset`; kept on the struct
// (created with capacity 100) so `matches_prefix` does not build a new `Vec` per call.
suffix_position_buffer: Vec<u32>,
}

impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
impl<TPostings: Postings, const SCORING_ENABLED: bool>
PhrasePrefixScorer<TPostings, SCORING_ENABLED>
{
// If similarity_weight is None, then scoring is disabled.
pub fn new(
mut term_postings: Vec<(usize, TPostings)>,
similarity_weight_opt: Option<Bm25Weight>,
fieldnorm_reader: FieldNormReader,
suffixes: Vec<TPostings>,
suffix_pos: usize,
) -> PhrasePrefixScorer<TPostings> {
) -> PhrasePrefixScorer<TPostings, SCORING_ENABLED> {
// correct indices so we can merge with our suffix term the PhraseScorer doesn't know about
let max_offset = term_postings
.iter()
Expand Down Expand Up @@ -140,6 +143,7 @@
suffixes,
suffix_offset: (max_offset - suffix_pos) as u32,
phrase_count: 0,
suffix_position_buffer: Vec::with_capacity(100),
};
if phrase_prefix_scorer.doc() != TERMINATED && !phrase_prefix_scorer.matches_prefix() {
phrase_prefix_scorer.advance();
Expand All @@ -153,7 +157,6 @@

fn matches_prefix(&mut self) -> bool {
let mut count = 0;
let mut positions = Vec::new();
let current_doc = self.doc();
let pos_matching = self.phrase_scorer.get_intersection();
for suffix in &mut self.suffixes {
Expand All @@ -162,16 +165,27 @@
}
let doc = suffix.seek(current_doc);
if doc == current_doc {
suffix.positions_with_offset(self.suffix_offset, &mut positions);
count += intersection_count(pos_matching, &positions);
suffix.positions_with_offset(self.suffix_offset, &mut self.suffix_position_buffer);
if SCORING_ENABLED {
count += intersection_count(pos_matching, &self.suffix_position_buffer);
} else {
if intersection_exists(pos_matching, &self.suffix_position_buffer) {
return true;
}
}

Check warning on line 175 in src/query/phrase_prefix_query/phrase_prefix_scorer.rs

View workflow job for this annotation

GitHub Actions / clippy

this `else { if .. }` block can be collapsed

warning: this `else { if .. }` block can be collapsed --> src/query/phrase_prefix_query/phrase_prefix_scorer.rs:171:24 | 171 | } else { | ________________________^ 172 | | if intersection_exists(pos_matching, &self.suffix_position_buffer) { 173 | | return true; 174 | | } 175 | | } | |_________________^ | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#collapsible_else_if = note: `#[warn(clippy::collapsible_else_if)]` on by default help: collapse nested if block | 171 ~ } else if intersection_exists(pos_matching, &self.suffix_position_buffer) { 172 + return true; 173 + } |
}
}
if !SCORING_ENABLED {
return false;
}
self.phrase_count = count as u32;
count != 0
}
}

impl<TPostings: Postings> DocSet for PhrasePrefixScorer<TPostings> {
impl<TPostings: Postings, const SCORING_ENABLED: bool> DocSet
for PhrasePrefixScorer<TPostings, SCORING_ENABLED>
{
fn advance(&mut self) -> DocId {
loop {
let doc = self.phrase_scorer.advance();
Expand All @@ -198,9 +212,15 @@
}
}

// NOTE(review): diff view — the next line is the pre-change `impl` header; the
// three lines after it are the new header with the `SCORING_ENABLED` const generic.
impl<TPostings: Postings> Scorer for PhrasePrefixScorer<TPostings> {
impl<TPostings: Postings, const SCORING_ENABLED: bool> Scorer
for PhrasePrefixScorer<TPostings, SCORING_ENABLED>
{
/// Returns the wrapped phrase scorer's score when `SCORING_ENABLED` is `true`,
/// and the constant `1.0` otherwise.
fn score(&mut self) -> Score {
if SCORING_ENABLED {
self.phrase_scorer.score()
} else {
// Scoring disabled: every matching doc gets the same fixed score.
1.0f32
}
// TODO modify score??
// NOTE(review): the line below looks like the pre-change return expression kept
// by the diff view; the `if`/`else` above supersedes it — confirm against the PR.
self.phrase_scorer.score()
}
}
53 changes: 45 additions & 8 deletions src/query/phrase_prefix_query/phrase_prefix_weight.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,11 @@
Ok(FieldNormReader::constant(reader.max_doc(), 1))
}

pub(crate) fn phrase_scorer(
pub(crate) fn phrase_prefix_scorer<const SCORING_ENABLED: bool>(
&self,
reader: &SegmentReader,
boost: Score,
) -> crate::Result<Option<PhrasePrefixScorer<SegmentPostings>>> {
) -> crate::Result<Option<PhrasePrefixScorer<SegmentPostings, SCORING_ENABLED>>> {
let similarity_weight_opt = self
.similarity_weight_opt
.as_ref()
Expand Down Expand Up @@ -128,15 +128,20 @@

impl Weight for PhrasePrefixWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
if let Some(scorer) = self.phrase_scorer(reader, boost)? {
Ok(Box::new(scorer))
if self.similarity_weight_opt.is_some() {
if let Some(scorer) = self.phrase_prefix_scorer::<true>(reader, boost)? {
return Ok(Box::new(scorer));
}
} else {
Ok(Box::new(EmptyScorer))
if let Some(scorer) = self.phrase_prefix_scorer::<false>(reader, boost)? {
return Ok(Box::new(scorer));
}
}

Check warning on line 139 in src/query/phrase_prefix_query/phrase_prefix_weight.rs

View workflow job for this annotation

GitHub Actions / clippy

this `else { if .. }` block can be collapsed

warning: this `else { if .. }` block can be collapsed --> src/query/phrase_prefix_query/phrase_prefix_weight.rs:135:16 | 135 | } else { | ________________^ 136 | | if let Some(scorer) = self.phrase_prefix_scorer::<false>(reader, boost)? { 137 | | return Ok(Box::new(scorer)); 138 | | } 139 | | } | |_________^ | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#collapsible_else_if help: collapse nested if block | 135 ~ } else if let Some(scorer) = self.phrase_prefix_scorer::<false>(reader, boost)? { 136 + return Ok(Box::new(scorer)); 137 + } |
Ok(Box::new(EmptyScorer))
}

fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let scorer_opt = self.phrase_scorer(reader, 1.0)?;
let scorer_opt = self.phrase_prefix_scorer::<true>(reader, 1.0)?;
if scorer_opt.is_none() {
return Err(does_not_match(doc));
}
Expand Down Expand Up @@ -200,7 +205,7 @@
.unwrap()
.unwrap();
let mut phrase_scorer = phrase_weight
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
.phrase_prefix_scorer::<true>(searcher.segment_reader(0u32), 1.0)?
.unwrap();
assert_eq!(phrase_scorer.doc(), 1);
assert_eq!(phrase_scorer.phrase_count(), 2);
Expand All @@ -211,6 +216,38 @@
Ok(())
}

/// With `SCORING_ENABLED = false` the scorer still visits every matching
/// document, but `phrase_count()` stays at 0 for each of them.
#[test]
pub fn test_phrase_no_count() -> crate::Result<()> {
    let index = create_index(&[
        "aa bb dd cc",
        "aa aa bb c dd aa bb cc aa bb dc",
        " aa bb cd",
    ])?;
    let text_field = index.schema().get_field("text").unwrap();
    let searcher = index.reader()?.searcher();

    // Phrase "aa bb" followed by the prefix "c".
    let terms: Vec<Term> = ["aa", "bb", "c"]
        .iter()
        .map(|&text| Term::from_field_text(text_field, text))
        .collect();
    let query = PhrasePrefixQuery::new(terms);

    let weight = query
        .phrase_prefix_query_weight(EnableScoring::enabled_from_searcher(&searcher))
        .unwrap()
        .unwrap();
    let segment_reader = searcher.segment_reader(0u32);
    let mut scorer = weight
        .phrase_prefix_scorer::<false>(segment_reader, 1.0)?
        .unwrap();

    // First matching document: doc 1, no count recorded.
    assert_eq!(scorer.doc(), 1);
    assert_eq!(scorer.phrase_count(), 0);

    // Second matching document: doc 2, still no count.
    assert_eq!(scorer.advance(), 2);
    assert_eq!(scorer.doc(), 2);
    assert_eq!(scorer.phrase_count(), 0);

    // Nothing left after that.
    assert_eq!(scorer.advance(), TERMINATED);
    Ok(())
}

#[test]
pub fn test_phrase_count_mid() -> crate::Result<()> {
let index = create_index(&["aa dd cc", "aa aa bb c dd aa bb cc aa dc", " aa bb cd"])?;
Expand All @@ -227,7 +264,7 @@
.unwrap()
.unwrap();
let mut phrase_scorer = phrase_weight
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
.phrase_prefix_scorer::<true>(searcher.segment_reader(0u32), 1.0)?
.unwrap();
assert_eq!(phrase_scorer.doc(), 1);
assert_eq!(phrase_scorer.phrase_count(), 2);
Expand Down
2 changes: 1 addition & 1 deletion src/query/phrase_query/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ mod phrase_scorer;
mod phrase_weight;

pub use self::phrase_query::PhraseQuery;
pub(crate) use self::phrase_scorer::intersection_count;
pub use self::phrase_scorer::PhraseScorer;
pub(crate) use self::phrase_scorer::{intersection_count, intersection_exists};
pub use self::phrase_weight::PhraseWeight;

#[cfg(test)]
Expand Down
2 changes: 1 addition & 1 deletion src/query/phrase_query/phrase_scorer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ pub struct PhraseScorer<TPostings: Postings> {
}

/// Returns true if and only if the two sorted arrays contain a common element
fn intersection_exists(left: &[u32], right: &[u32]) -> bool {
pub(crate) fn intersection_exists(left: &[u32], right: &[u32]) -> bool {
let mut left_index = 0;
let mut right_index = 0;
while left_index < left.len() && right_index < right.len() {
Expand Down
Loading