From 6878f11a3ed02957ebce750e2a5714d8bc00c7da Mon Sep 17 00:00:00 2001 From: Doug Turnbull Date: Mon, 15 Jul 2024 07:35:25 -0400 Subject: [PATCH] Experiment with tf cache --- searcharray/phrase/memmap_arrays.py | 7 +++++++ searcharray/phrase/middle_out.py | 17 ++++++++++++++--- searcharray/postings.py | 4 ++++ setup.py | 2 +- 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/searcharray/phrase/memmap_arrays.py b/searcharray/phrase/memmap_arrays.py index e4fafd2..5444dc2 100644 --- a/searcharray/phrase/memmap_arrays.py +++ b/searcharray/phrase/memmap_arrays.py @@ -93,6 +93,13 @@ def __getitem__(self, key): else: raise KeyError(f'Key {key} not found.') + def item_len(self, key): + key = int(key) + if key in self.metadata: + return self.metadata[key]['length'] + else: + raise KeyError(f'Key {key} not found.') + def __setitem__(self, key, value): key = int(key) if value.dtype != self.dtype: diff --git a/searcharray/phrase/middle_out.py b/searcharray/phrase/middle_out.py index 4ca5eea..291c940 100644 --- a/searcharray/phrase/middle_out.py +++ b/searcharray/phrase/middle_out.py @@ -292,12 +292,20 @@ def __iter__(self): def __len__(self): return len(self.doc_ids) + def item_len(self, key): + if not self.base_item: + return 0 + return self.base.item_len(key) + class PosnBitArray: - def __init__(self, encoded_term_posns: Union[ArrayDict, FilteredPosns], max_doc_id: int): + def __init__(self, encoded_term_posns: Union[ArrayDict, FilteredPosns], + max_doc_id: int, + cache_at_len: int = 10): self.encoded_term_posns = encoded_term_posns self.max_doc_id = max_doc_id + self.cache_at_len = cache_at_len self.docfreq_cache : Dict[int, np.uint64] = {} self.termfreq_cache : Dict[int, Tuple[np.ndarray, np.ndarray]] = {} @@ -470,6 +478,8 @@ def _termfreqs_with_cache(self, term_id: int) -> Tuple[np.ndarray, np.ndarray]: term_posns = self.encoded_term_posns[term_id] doc_ids, term_freqs = self._computed_term_freqs(term_posns) if self._is_cached(term_id): + doc_ids = doc_ids[term_freqs > 0] + term_freqs = term_freqs[term_freqs > 0] self.termfreq_cache[term_id] = (doc_ids, term_freqs) return doc_ids, term_freqs @@ -480,8 +490,9 @@ def _docfreq_from_cache(self, term_id: int) -> np.uint64: return self.docfreq_cache[term_id] def _maybe_cache_docfreq(self, term_id: int, docfreq: np.uint64): - if self.max_doc_id >= 99999 and docfreq > (self.max_doc_id // 100): - self.docfreq_cache[term_id] = docfreq + if self.encoded_term_posns: + if self.encoded_term_posns.item_len(term_id) > self.cache_at_len: + self.docfreq_cache[term_id] = docfreq def docfreq(self, term_id: int) -> np.uint64: try: diff --git a/searcharray/postings.py b/searcharray/postings.py index 8b7cfc7..57f8097 100644 --- a/searcharray/postings.py +++ b/searcharray/postings.py @@ -432,6 +432,10 @@ def value_counts( counts = Counter(self[:]) return pd.Series(counts) + def tf_cache_at(self, terms_len): + """Cache term frequencies at a given length.""" + self.posns.cache_at_len = terms_len + def __len__(self): len_rval = len(self.term_mat.rows) return len_rval diff --git a/setup.py b/setup.py index 220af63..3c6587b 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ # For a discussion on single-sourcing the version across setup.py and the # project code, see # https://packaging.python.org/guides/single-sourcing-package-version/ - version="0.0.64", # Required + version="0.0.65", # Required # This is a one-line description or tagline of what your project does. This # corresponds to the "Summary" metadata field: # https://packaging.python.org/specifications/core-metadata/#summary