Skip to content

Commit

Permalink
Divide tf in place in similarity
Browse files Browse the repository at this point in the history
  • Loading branch information
softwaredoug committed Mar 6, 2024
1 parent b7929b9 commit 0f1cd80
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 3 deletions.
4 changes: 2 additions & 2 deletions searcharray/postings.py
Original file line number Diff line number Diff line change
Expand Up @@ -526,7 +526,7 @@ def termfreqs(self, token: Union[List[str], str]) -> np.ndarray:

try:
term_id = self.term_dict.get_term_id(token)
- matches = np.zeros(len(self), dtype=int)
+ matches = np.zeros(len(self), dtype=np.float32)
slice_of_rows = None
if self.term_mat.subset:
slice_of_rows = self.term_mat.rows
Expand All @@ -541,7 +541,7 @@ def termfreqs(self, token: Union[List[str], str]) -> np.ndarray:
matches[doc_ids] = termfreqs
return matches
except TermMissingError:
- return np.zeros(len(self), dtype=int)
+ return np.zeros(len(self), dtype=np.float32)

def docfreq(self, token: str) -> int:
if not isinstance(token, str):
Expand Down
5 changes: 4 additions & 1 deletion searcharray/similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@ def compute_idf(num_docs, sum_dfs):

def compute_tfs(term_freqs: np.ndarray, doc_lens, avg_doc_lens, k1, b):
adj_doc_lens = k1 * (1 - b + b * doc_lens / avg_doc_lens)
- return term_freqs / (term_freqs + adj_doc_lens)
+ # Divide tf in place for perf, but this means
+ # we can't use the same term_freqs for different k1, b
+ term_freqs /= (term_freqs + adj_doc_lens)
+ return term_freqs


def bm25_similarity(k1: float = 1.2, b: float = 0.75) -> Similarity:
Expand Down

0 comments on commit 0f1cd80

Please sign in to comment.