Skip to content

Commit

Permalink
BM25 perf improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
softwaredoug committed Mar 6, 2024
1 parent 6f94e06 commit b7929b9
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 3 deletions.
2 changes: 1 addition & 1 deletion searcharray/phrase/middle_out.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,7 @@ def _computed_term_freqs(self, term_posns) -> Tuple[np.ndarray, np.ndarray]:
posns = term_posns & encoder.payload_lsb_mask
bit_counts = bit_count64(posns)

term_freqs = np.add.reduceat(bit_counts, change_indices)
term_freqs = np.add.reduceat(bit_counts, change_indices, dtype=np.float32)
return sorted_unique(doc_ids), term_freqs

def _termfreqs_with_cache(self, term_id: int) -> Tuple[np.ndarray, np.ndarray]:
Expand Down
15 changes: 13 additions & 2 deletions searcharray/similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,16 @@ def __call__(self, term_freqs: np.ndarray, doc_freqs: np.ndarray, doc_lens: np.n
...


def compute_idf(num_docs, sum_dfs):
    """Calculate the BM25 inverse document frequency (idf).

    Evaluates ``log(1 + (num_docs - sum_dfs + 0.5) / (sum_dfs + 0.5))``.
    The 0.5 terms smooth the ratio, and the ``1 +`` inside the log keeps
    the result non-negative whenever ``sum_dfs <= num_docs``.

    Args:
        num_docs: Number of documents in the corpus.
        sum_dfs: Document frequency to score; a scalar or a NumPy array
            (the arithmetic broadcasts element-wise).

    Returns:
        The idf value(s), broadcast to the shape of the inputs.
    """
    docs_without_term = num_docs - sum_dfs + 0.5
    docs_with_term = sum_dfs + 0.5
    return np.log(1 + docs_without_term / docs_with_term)


def compute_tfs(term_freqs: np.ndarray, doc_lens, avg_doc_lens, k1, b):
    """Calculate the BM25 saturated term-frequency component.

    Evaluates ``tf / (tf + k1 * (1 - b + b * doc_lens / avg_doc_lens))``
    element-wise.

    Args:
        term_freqs: Raw term frequencies.
        doc_lens: Document lengths; must broadcast against ``term_freqs``.
        avg_doc_lens: Average document length used to normalize
            ``doc_lens``.
        k1: Saturation parameter scaling the length-adjusted denominator.
        b: Length-normalization weight; with ``b == 0`` the document
            length has no effect on the result.

    Returns:
        The saturated term-frequency value(s); ``0`` wherever
        ``term_freqs`` is ``0`` and the adjusted length is non-zero.
    """
    # Length normalization: documents longer than average get a larger
    # denominator and therefore a smaller score for the same raw tf.
    length_norm = 1 - b + b * doc_lens / avg_doc_lens
    adj_doc_lens = k1 * length_norm
    return term_freqs / (term_freqs + adj_doc_lens)


def bm25_similarity(k1: float = 1.2, b: float = 0.75) -> Similarity:
"""BM25 similarity function, as in Lucene 9."""
def bm25(term_freqs: np.ndarray, doc_freqs: np.ndarray,
Expand All @@ -21,9 +31,10 @@ def bm25(term_freqs: np.ndarray, doc_freqs: np.ndarray,
# Sum doc freqs
sum_dfs = np.sum(doc_freqs, axis=0)
# Calculate idf
idf = np.log(1 + (num_docs - sum_dfs + 0.5) / (sum_dfs + 0.5))
idf = compute_idf(num_docs, sum_dfs)
# Calculate tf
tf = term_freqs / (term_freqs + k1 * (1 - b + b * doc_lens / avg_doc_lens))
# tf = term_freqs / (term_freqs + k1 * (1 - b + b * doc_lens / avg_doc_lens))
tf = compute_tfs(term_freqs, doc_lens, avg_doc_lens, k1, b)
return idf * tf
return bm25

Expand Down

0 comments on commit b7929b9

Please sign in to comment.