Divide tf in place in similarity

softwaredoug · Mar 6, 2024 · 0f1cd80
1 parent b7929b9
commit 0f1cd80
Show file tree

Hide file tree

Showing 2 changed files with 6 additions and 3 deletions.
diff --git a/searcharray/postings.py b/searcharray/postings.py
@@ -526,7 +526,7 @@ def termfreqs(self, token: Union[List[str], str]) -> np.ndarray:
 
         try:
             term_id = self.term_dict.get_term_id(token)
-            matches = np.zeros(len(self), dtype=int)
+            matches = np.zeros(len(self), dtype=np.float32)
             slice_of_rows = None
             if self.term_mat.subset:
                 slice_of_rows = self.term_mat.rows
@@ -541,7 +541,7 @@ def termfreqs(self, token: Union[List[str], str]) -> np.ndarray:
                 matches[doc_ids] = termfreqs
                 return matches
         except TermMissingError:
-            return np.zeros(len(self), dtype=int)
+            return np.zeros(len(self), dtype=np.float32)
 
     def docfreq(self, token: str) -> int:
         if not isinstance(token, str):

diff --git a/searcharray/similarity.py b/searcharray/similarity.py
@@ -19,7 +19,10 @@ def compute_idf(num_docs, sum_dfs):
 
 def compute_tfs(term_freqs: np.ndarray, doc_lens, avg_doc_lens, k1, b):
     adj_doc_lens = k1 * (1 - b + b * doc_lens / avg_doc_lens)
-    return term_freqs / (term_freqs + adj_doc_lens)
+    # Divide tf in place for perf; this overwrites the caller's array,
+    # so the same term_freqs can't be reused for different k1, b
+    term_freqs /= (term_freqs + adj_doc_lens)
+    return term_freqs
 
 
 def bm25_similarity(k1: float = 1.2, b: float = 0.75) -> Similarity: