From b7929b9f782107e2b46e95c28b42badf3b9c4184 Mon Sep 17 00:00:00 2001 From: Doug Turnbull Date: Wed, 6 Mar 2024 09:57:59 -0500 Subject: [PATCH] BM25 perf improvements --- searcharray/phrase/middle_out.py | 2 +- searcharray/similarity.py | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/searcharray/phrase/middle_out.py b/searcharray/phrase/middle_out.py index 16cfe45..2ff4e57 100644 --- a/searcharray/phrase/middle_out.py +++ b/searcharray/phrase/middle_out.py @@ -426,7 +426,7 @@ def _computed_term_freqs(self, term_posns) -> Tuple[np.ndarray, np.ndarray]: posns = term_posns & encoder.payload_lsb_mask bit_counts = bit_count64(posns) - term_freqs = np.add.reduceat(bit_counts, change_indices) + term_freqs = np.add.reduceat(bit_counts, change_indices, dtype=np.float32) return sorted_unique(doc_ids), term_freqs def _termfreqs_with_cache(self, term_id: int) -> Tuple[np.ndarray, np.ndarray]: diff --git a/searcharray/similarity.py b/searcharray/similarity.py index 32e8bee..9c635ee 100644 --- a/searcharray/similarity.py +++ b/searcharray/similarity.py @@ -12,6 +12,16 @@ def __call__(self, term_freqs: np.ndarray, doc_freqs: np.ndarray, doc_lens: np.n ... +def compute_idf(num_docs, sum_dfs): + """Calculate idf.""" + return np.log(1 + (num_docs - sum_dfs + 0.5) / (sum_dfs + 0.5)) + + +def compute_tfs(term_freqs: np.ndarray, doc_lens, avg_doc_lens, k1, b): + adj_doc_lens = k1 * (1 - b + b * doc_lens / avg_doc_lens) + return term_freqs / (term_freqs + adj_doc_lens) + + def bm25_similarity(k1: float = 1.2, b: float = 0.75) -> Similarity: """BM25 similarity function, as in Lucene 9.""" def bm25(term_freqs: np.ndarray, doc_freqs: np.ndarray, @@ -21,9 +31,10 @@ def bm25(term_freqs: np.ndarray, doc_freqs: np.ndarray, # Sum doc freqs sum_dfs = np.sum(doc_freqs, axis=0) # Calculate idf - idf = np.log(1 + (num_docs - sum_dfs + 0.5) / (sum_dfs + 0.5)) + idf = compute_idf(num_docs, sum_dfs) # Calculate tf - tf = term_freqs / (term_freqs + k1 * (1 - b + b * doc_lens / avg_doc_lens)) + # tf = term_freqs / (term_freqs + k1 * (1 - b + b * doc_lens / avg_doc_lens)) + tf = compute_tfs(term_freqs, doc_lens, avg_doc_lens, k1, b) return idf * tf return bm25