From 0f1cd80427142128d91a5bcacee7ea4f73b8a054 Mon Sep 17 00:00:00 2001
From: Doug Turnbull <softwaredoug@gmail.com>
Date: Wed, 6 Mar 2024 10:43:26 -0500
Subject: [PATCH] Divide tf in place in similarity

---
 searcharray/postings.py   | 4 ++--
 searcharray/similarity.py | 5 ++++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/searcharray/postings.py b/searcharray/postings.py
index 5c4eb3d..b2cb2b3 100644
--- a/searcharray/postings.py
+++ b/searcharray/postings.py
@@ -526,7 +526,7 @@ def termfreqs(self, token: Union[List[str], str]) -> np.ndarray:
 
         try:
             term_id = self.term_dict.get_term_id(token)
-            matches = np.zeros(len(self), dtype=int)
+            matches = np.zeros(len(self), dtype=np.float32)
             slice_of_rows = None
             if self.term_mat.subset:
                 slice_of_rows = self.term_mat.rows
@@ -541,7 +541,7 @@ def termfreqs(self, token: Union[List[str], str]) -> np.ndarray:
                 matches[doc_ids] = termfreqs
                 return matches
         except TermMissingError:
-            return np.zeros(len(self), dtype=int)
+            return np.zeros(len(self), dtype=np.float32)
 
     def docfreq(self, token: str) -> int:
         if not isinstance(token, str):
diff --git a/searcharray/similarity.py b/searcharray/similarity.py
index 9c635ee..70ec1c2 100644
--- a/searcharray/similarity.py
+++ b/searcharray/similarity.py
@@ -19,7 +19,10 @@ def compute_idf(num_docs, sum_dfs):
 
 def compute_tfs(term_freqs: np.ndarray, doc_lens, avg_doc_lens, k1, b):
     adj_doc_lens = k1 * (1 - b + b * doc_lens / avg_doc_lens)
-    return term_freqs / (term_freqs + adj_doc_lens)
+    # Divide in place to avoid allocating a result array. This overwrites the
+    # caller's term_freqs (a float array), so it can't be reused for other k1, b
+    term_freqs /= (term_freqs + adj_doc_lens)
+    return term_freqs
 
 
 def bm25_similarity(k1: float = 1.2, b: float = 0.75) -> Similarity: