Skip to content

Commit

Permalink
Fully roll back the active doc ids idea in favor of slicing
Browse files Browse the repository at this point in the history
  • Loading branch information
softwaredoug committed May 11, 2024
1 parent ae9077b commit 0e47c3f
Showing 1 changed file with 6 additions and 24 deletions.
30 changes: 6 additions & 24 deletions searcharray/postings.py
Original file line number Diff line number Diff line change
Expand Up @@ -564,37 +564,24 @@ def memory_report(self):
# ***********************************************************
def termfreqs(self, token: Union[List[str], str],
slop: int = 0,
active_docs: Optional[np.ndarray] = None,
min_posn: Optional[int] = None,
max_posn: Optional[int] = None) -> np.ndarray:
token = self._check_token_arg(token)
if isinstance(token, list):
return self.phrase_freq(token, slop=slop, min_posn=min_posn, max_posn=max_posn,
active_docs=active_docs)
return self.phrase_freq(token, slop=slop, min_posn=min_posn, max_posn=max_posn)

try:
term_id = self.term_dict.get_term_id(token)
slice_of_rows = self.term_mat.rows
if self.term_mat.subset:
matches = np.zeros(len(self), dtype=np.float32)
if active_docs is not None:
slice_of_rows = self.term_mat.rows[active_docs > 0]
doc_ids, termfreqs = self.posns.termfreqs(term_id,
doc_ids=slice_of_rows,
min_posn=min_posn,
max_posn=max_posn)
mask = np.isin(self.term_mat.rows, doc_ids)
matches[mask] = termfreqs
return matches
elif active_docs is not None:
matches = np.zeros(len(self), dtype=np.float32)
slice_of_rows = self.term_mat.rows[active_docs > 0]
doc_ids, termfreqs = self.posns.termfreqs(term_id,
doc_ids=slice_of_rows,
min_posn=min_posn,
max_posn=max_posn)
matches[doc_ids] = termfreqs
return matches
else:
doc_ids, termfreqs = self.posns.termfreqs(term_id,
doc_ids=None,
Expand Down Expand Up @@ -625,10 +612,10 @@ def match(self, token: Union[List[str], str], slop: int = 0) -> np.ndarray:
term_freq = self.termfreqs(token)
return term_freq > 0

def score(self, token: Union[str, List[str]], similarity: Similarity = default_bm25,
def score(self, token: Union[str, List[str]],
similarity: Similarity = default_bm25,
min_posn: Optional[int] = None,
max_posn: Optional[int] = None,
active_docs: Optional[np.ndarray] = None) -> np.ndarray:
max_posn: Optional[int] = None) -> np.ndarray:
"""Score each doc using a similarity function.
Parameters
Expand All @@ -637,7 +624,6 @@ def score(self, token: Union[str, List[str]], similarity: Similarity = default_b
similarity : How to score the documents. Default is BM25.
min_posn : int - minimum position of the term in the document, in multiples of 18
max_posn : int - maximum position of the term in the document, in multiples of 18
active_docs : np.ndarray - a boolean mask of which documents to score, the rest will receive 0
"""
# Get term freqs per token
token = self._check_token_arg(token)
Expand All @@ -647,8 +633,7 @@ def score(self, token: Union[str, List[str]], similarity: Similarity = default_b
tokens_l = [token] if isinstance(token, str) else token
all_dfs = np.asarray([self.docfreq(token) for token in tokens_l])

tfs = self.termfreqs(token, min_posn=min_posn, max_posn=max_posn,
active_docs=active_docs)
tfs = self.termfreqs(token, min_posn=min_posn, max_posn=max_posn)
token = self._check_token_arg(token)
doc_lens = self.doclengths()

Expand Down Expand Up @@ -678,15 +663,12 @@ def or_query(self, tokens: Union[List[str], List[List[str]]], min_should_match:

def phrase_freq(self, tokens: List[str],
slop=0,
active_docs: Optional[np.ndarray] = None,
min_posn: Optional[int] = None,
max_posn: Optional[int] = None) -> np.ndarray:
try:
# Decide how/if we need to filter doc ids
doc_ids = None
if active_docs is not None:
doc_ids = self.term_mat.rows[active_docs > 0]
elif self.term_mat.subset:
if self.term_mat.subset:
doc_ids = self.term_mat.rows

term_ids = [self.term_dict.get_term_id(token) for token in tokens]
Expand Down

0 comments on commit 0e47c3f

Please sign in to comment.