Skip to content

Commit

Permalink
Experiment with tf cache
Browse files Browse the repository at this point in the history
  • Loading branch information
softwaredoug committed Jul 15, 2024
1 parent feb43f3 commit 6878f11
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 4 deletions.
7 changes: 7 additions & 0 deletions searcharray/phrase/memmap_arrays.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,13 @@ def __getitem__(self, key):
else:
raise KeyError(f'Key {key} not found.')

def item_len(self, key):
    """Return the ``'length'`` metadata value recorded for ``key``.

    ``key`` is coerced with ``int()`` before it is looked up in
    ``self.metadata``.

    Raises:
        KeyError: if the coerced key has no entry in ``self.metadata``.
    """
    key = int(key)
    # Guard first so the error message matches the one used by __getitem__.
    if key not in self.metadata:
        raise KeyError(f'Key {key} not found.')
    return self.metadata[key]['length']

def __setitem__(self, key, value):
key = int(key)
if value.dtype != self.dtype:
Expand Down
17 changes: 14 additions & 3 deletions searcharray/phrase/middle_out.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,12 +292,20 @@ def __iter__(self):
def __len__(self):
    """Return the number of entries in ``self.doc_ids``."""
    return len(self.doc_ids)

def item_len(self, key):
    """Return the length that ``self.base`` reports for ``key``.

    Returns 0 without consulting ``self.base`` when ``self.base_item`` is
    falsy; otherwise the result of ``self.base.item_len(key)`` is passed
    through unchanged.
    """
    # NOTE(review): the guard reads `base_item` while the delegate is `base`;
    # confirm both attributes are defined on this class.
    if self.base_item:
        return self.base.item_len(key)
    return 0


class PosnBitArray:

def __init__(self, encoded_term_posns: Union[ArrayDict, FilteredPosns], max_doc_id: int):
def __init__(self, encoded_term_posns: Union[ArrayDict, FilteredPosns],
max_doc_id: int,
cache_at_len: int = 10):
self.encoded_term_posns = encoded_term_posns
self.max_doc_id = max_doc_id
self.cache_at_len = cache_at_len
self.docfreq_cache : Dict[int, np.uint64] = {}
self.termfreq_cache : Dict[int, Tuple[np.ndarray, np.ndarray]] = {}

Expand Down Expand Up @@ -470,6 +478,8 @@ def _termfreqs_with_cache(self, term_id: int) -> Tuple[np.ndarray, np.ndarray]:
term_posns = self.encoded_term_posns[term_id]
doc_ids, term_freqs = self._computed_term_freqs(term_posns)
if self._is_cached(term_id):
doc_ids = doc_ids[term_freqs > 0]
term_freqs = term_freqs[term_freqs > 0]
self.termfreq_cache[term_id] = (doc_ids, term_freqs)
return doc_ids, term_freqs

Expand All @@ -480,8 +490,9 @@ def _docfreq_from_cache(self, term_id: int) -> np.uint64:
return self.docfreq_cache[term_id]

def _maybe_cache_docfreq(self, term_id: int, docfreq: np.uint64):
    """Store ``docfreq`` in ``self.docfreq_cache`` when the term qualifies.

    The entry is written only when ``self.encoded_term_posns`` is truthy and
    ``self.encoded_term_posns.item_len(term_id)`` is strictly greater than
    ``self.cache_at_len``. Nothing is returned.

    This is the rule added by the commit shown in this diff. It replaces the
    earlier heuristic, which cached when ``max_doc_id >= 99999`` and
    ``docfreq > max_doc_id // 100``.
    """
    posns = self.encoded_term_posns
    # Truthiness is tested first so item_len() is never called on a falsy
    # container.
    if posns and posns.item_len(term_id) > self.cache_at_len:
        self.docfreq_cache[term_id] = docfreq

def docfreq(self, term_id: int) -> np.uint64:
try:
Expand Down
4 changes: 4 additions & 0 deletions searcharray/postings.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,10 @@ def value_counts(
counts = Counter(self[:])
return pd.Series(counts)

def tf_cache_at(self, terms_len):
    """Set the length threshold used when caching term frequencies.

    Assigns ``terms_len`` to ``self.posns.cache_at_len``. Nothing else is
    changed and nothing is returned.
    """
    self.posns.cache_at_len = terms_len

def __len__(self):
    """Return the number of entries in ``self.term_mat.rows``."""
    return len(self.term_mat.rows)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
# For a discussion on single-sourcing the version across setup.py and the
# project code, see
# https://packaging.python.org/guides/single-sourcing-package-version/
version="0.0.64", # Required
version="0.0.65", # Required
# This is a one-line description or tagline of what your project does. This
# corresponds to the "Summary" metadata field:
# https://packaging.python.org/specifications/core-metadata/#summary
Expand Down

0 comments on commit 6878f11

Please sign in to comment.