From 0f1cd80427142128d91a5bcacee7ea4f73b8a054 Mon Sep 17 00:00:00 2001
From: Doug Turnbull
Date: Wed, 6 Mar 2024 10:43:26 -0500
Subject: [PATCH] Divide tf in place in similarity

---
 searcharray/postings.py   | 4 ++--
 searcharray/similarity.py | 5 ++++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/searcharray/postings.py b/searcharray/postings.py
index 5c4eb3d..b2cb2b3 100644
--- a/searcharray/postings.py
+++ b/searcharray/postings.py
@@ -526,7 +526,7 @@ def termfreqs(self, token: Union[List[str], str]) -> np.ndarray:
 
         try:
             term_id = self.term_dict.get_term_id(token)
-            matches = np.zeros(len(self), dtype=int)
+            matches = np.zeros(len(self), dtype=np.float32)
             slice_of_rows = None
             if self.term_mat.subset:
                 slice_of_rows = self.term_mat.rows
@@ -541,7 +541,7 @@ def termfreqs(self, token: Union[List[str], str]) -> np.ndarray:
             matches[doc_ids] = termfreqs
             return matches
         except TermMissingError:
-            return np.zeros(len(self), dtype=int)
+            return np.zeros(len(self), dtype=np.float32)
 
     def docfreq(self, token: str) -> int:
         if not isinstance(token, str):
diff --git a/searcharray/similarity.py b/searcharray/similarity.py
index 9c635ee..70ec1c2 100644
--- a/searcharray/similarity.py
+++ b/searcharray/similarity.py
@@ -19,7 +19,10 @@ def compute_idf(num_docs, sum_dfs):
 
 def compute_tfs(term_freqs: np.ndarray, doc_lens, avg_doc_lens, k1, b):
     adj_doc_lens = k1 * (1 - b + b * doc_lens / avg_doc_lens)
-    return term_freqs / (term_freqs + adj_doc_lens)
+    # Divide tf in place for perf, but this means
+    # we can't use the same term_freqs for different k1, b
+    term_freqs /= (term_freqs + adj_doc_lens)
+    return term_freqs
 
 
 def bm25_similarity(k1: float = 1.2, b: float = 0.75) -> Similarity: