Fully roll back the active doc IDs idea in favor of slicing

softwaredoug · May 11, 2024 · 0e47c3f · 0e47c3f
1 parent ae9077b
commit 0e47c3f
Showing 1 changed file with 6 additions and 24 deletions.
diff --git a/searcharray/postings.py b/searcharray/postings.py
@@ -564,37 +564,24 @@ def memory_report(self):
     # ***********************************************************
     def termfreqs(self, token: Union[List[str], str],
                   slop: int = 0,
-                  active_docs: Optional[np.ndarray] = None,
                   min_posn: Optional[int] = None,
                   max_posn: Optional[int] = None) -> np.ndarray:
         token = self._check_token_arg(token)
         if isinstance(token, list):
-            return self.phrase_freq(token, slop=slop, min_posn=min_posn, max_posn=max_posn,
-                                    active_docs=active_docs)
+            return self.phrase_freq(token, slop=slop, min_posn=min_posn, max_posn=max_posn)
 
         try:
             term_id = self.term_dict.get_term_id(token)
             slice_of_rows = self.term_mat.rows
             if self.term_mat.subset:
                 matches = np.zeros(len(self), dtype=np.float32)
-                if active_docs is not None:
-                    slice_of_rows = self.term_mat.rows[active_docs > 0]
                 doc_ids, termfreqs = self.posns.termfreqs(term_id,
                                                           doc_ids=slice_of_rows,
                                                           min_posn=min_posn,
                                                           max_posn=max_posn)
                 mask = np.isin(self.term_mat.rows, doc_ids)
                 matches[mask] = termfreqs
                 return matches
-            elif active_docs is not None:
-                matches = np.zeros(len(self), dtype=np.float32)
-                slice_of_rows = self.term_mat.rows[active_docs > 0]
-                doc_ids, termfreqs = self.posns.termfreqs(term_id,
-                                                          doc_ids=slice_of_rows,
-                                                          min_posn=min_posn,
-                                                          max_posn=max_posn)
-                matches[doc_ids] = termfreqs
-                return matches
             else:
                 doc_ids, termfreqs = self.posns.termfreqs(term_id,
                                                           doc_ids=None,
@@ -625,10 +612,10 @@ def match(self, token: Union[List[str], str], slop: int = 0) -> np.ndarray:
             term_freq = self.termfreqs(token)
         return term_freq > 0
 
-    def score(self, token: Union[str, List[str]], similarity: Similarity = default_bm25,
+    def score(self, token: Union[str, List[str]],
+              similarity: Similarity = default_bm25,
               min_posn: Optional[int] = None,
-              max_posn: Optional[int] = None,
-              active_docs: Optional[np.ndarray] = None) -> np.ndarray:
+              max_posn: Optional[int] = None) -> np.ndarray:
         """Score each doc using a similarity function.
 
         Parameters
@@ -637,7 +624,6 @@ def score(self, token: Union[str, List[str]], similarity: Similarity = default_b
         similarity : How to score the documents. Default is BM25.
         min_posn : int - minimum position of the term in the document, in multiples of 18
         max_posn : int - maximum position of the term in the document, in multiples of 18
-        active_docs : np.ndarray - a boolean mask of which documents to score, the rest will receive 0
         """
         # Get term freqs per token
         token = self._check_token_arg(token)
@@ -647,8 +633,7 @@ def score(self, token: Union[str, List[str]], similarity: Similarity = default_b
         tokens_l = [token] if isinstance(token, str) else token
         all_dfs = np.asarray([self.docfreq(token) for token in tokens_l])
 
-        tfs = self.termfreqs(token, min_posn=min_posn, max_posn=max_posn,
-                             active_docs=active_docs)
+        tfs = self.termfreqs(token, min_posn=min_posn, max_posn=max_posn)
         token = self._check_token_arg(token)
         doc_lens = self.doclengths()
 
@@ -678,15 +663,12 @@ def or_query(self, tokens: Union[List[str], List[List[str]]], min_should_match:
 
     def phrase_freq(self, tokens: List[str],
                     slop=0,
-                    active_docs: Optional[np.ndarray] = None,
                     min_posn: Optional[int] = None,
                     max_posn: Optional[int] = None) -> np.ndarray:
         try:
             # Decide how/if we need to filter doc ids
             doc_ids = None
-            if active_docs is not None:
-                doc_ids = self.term_mat.rows[active_docs > 0]
-            elif self.term_mat.subset:
+            if self.term_mat.subset:
                 doc_ids = self.term_mat.rows
 
             term_ids = [self.term_dict.get_term_id(token) for token in tokens]