From 0e47c3f29734d6cb9abf48e546355ba4999080cf Mon Sep 17 00:00:00 2001 From: Doug Turnbull Date: Fri, 10 May 2024 22:07:50 -0400 Subject: [PATCH] Fully rollback the active doc ids idea in favor of slicing --- searcharray/postings.py | 30 ++++++------------------------ 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/searcharray/postings.py b/searcharray/postings.py index 306af14..5981b1c 100644 --- a/searcharray/postings.py +++ b/searcharray/postings.py @@ -564,21 +564,17 @@ def memory_report(self): # *********************************************************** def termfreqs(self, token: Union[List[str], str], slop: int = 0, - active_docs: Optional[np.ndarray] = None, min_posn: Optional[int] = None, max_posn: Optional[int] = None) -> np.ndarray: token = self._check_token_arg(token) if isinstance(token, list): - return self.phrase_freq(token, slop=slop, min_posn=min_posn, max_posn=max_posn, - active_docs=active_docs) + return self.phrase_freq(token, slop=slop, min_posn=min_posn, max_posn=max_posn) try: term_id = self.term_dict.get_term_id(token) slice_of_rows = self.term_mat.rows if self.term_mat.subset: matches = np.zeros(len(self), dtype=np.float32) - if active_docs is not None: - slice_of_rows = self.term_mat.rows[active_docs > 0] doc_ids, termfreqs = self.posns.termfreqs(term_id, doc_ids=slice_of_rows, min_posn=min_posn, @@ -586,15 +582,6 @@ def termfreqs(self, token: Union[List[str], str], mask = np.isin(self.term_mat.rows, doc_ids) matches[mask] = termfreqs return matches - elif active_docs is not None: - matches = np.zeros(len(self), dtype=np.float32) - slice_of_rows = self.term_mat.rows[active_docs > 0] - doc_ids, termfreqs = self.posns.termfreqs(term_id, - doc_ids=slice_of_rows, - min_posn=min_posn, - max_posn=max_posn) - matches[doc_ids] = termfreqs - return matches else: doc_ids, termfreqs = self.posns.termfreqs(term_id, doc_ids=None, @@ -625,10 +612,10 @@ def match(self, token: Union[List[str], str], slop: int = 0) -> np.ndarray: term_freq = self.termfreqs(token) return term_freq > 0 - def score(self, token: Union[str, List[str]], similarity: Similarity = default_bm25, + def score(self, token: Union[str, List[str]], + similarity: Similarity = default_bm25, min_posn: Optional[int] = None, - max_posn: Optional[int] = None, - active_docs: Optional[np.ndarray] = None) -> np.ndarray: + max_posn: Optional[int] = None) -> np.ndarray: """Score each doc using a similarity function. Parameters @@ -637,7 +624,6 @@ def score(self, token: Union[str, List[str]], similarity: Similarity = default_b similarity : How to score the documents. Default is BM25. min_posn : int - minimum position of the term in the document, in multiples of 18 max_posn : int - maximum position of the term in the document, in multiples of 18 - active_docs : np.ndarray - a boolean mask of which documents to score, the rest will receive 0 """ # Get term freqs per token token = self._check_token_arg(token) @@ -647,8 +633,7 @@ def score(self, token: Union[str, List[str]], similarity: Similarity = default_b tokens_l = [token] if isinstance(token, str) else token all_dfs = np.asarray([self.docfreq(token) for token in tokens_l]) - tfs = self.termfreqs(token, min_posn=min_posn, max_posn=max_posn, - active_docs=active_docs) + tfs = self.termfreqs(token, min_posn=min_posn, max_posn=max_posn) token = self._check_token_arg(token) doc_lens = self.doclengths() @@ -678,15 +663,12 @@ def or_query(self, tokens: Union[List[str], List[List[str]]], min_should_match: def phrase_freq(self, tokens: List[str], slop=0, - active_docs: Optional[np.ndarray] = None, min_posn: Optional[int] = None, max_posn: Optional[int] = None) -> np.ndarray: try: # Decide how/if we need to filter doc ids doc_ids = None - if active_docs is not None: - doc_ids = self.term_mat.rows[active_docs > 0] - elif self.term_mat.subset: + if self.term_mat.subset: doc_ids = self.term_mat.rows term_ids = [self.term_dict.get_term_id(token) for token in tokens]