Experiment with tf cache

softwaredoug · Jul 15, 2024 · 6878f11
1 parent feb43f3
commit 6878f11
Show file tree

Hide file tree

Showing 4 changed files with 26 additions and 4 deletions.
diff --git a/searcharray/phrase/memmap_arrays.py b/searcharray/phrase/memmap_arrays.py
@@ -93,6 +93,13 @@ def __getitem__(self, key):
         else:
             raise KeyError(f'Key {key} not found.')
 
+    def item_len(self, key):
+        key = int(key)
+        if key in self.metadata:
+            return self.metadata[key]['length']
+        else:
+            raise KeyError(f'Key {key} not found.')
+
     def __setitem__(self, key, value):
         key = int(key)
         if value.dtype != self.dtype:

diff --git a/searcharray/phrase/middle_out.py b/searcharray/phrase/middle_out.py
@@ -292,12 +292,20 @@ def __iter__(self):
     def __len__(self):
         return len(self.doc_ids)
 
+    def item_len(self, key):
+        if not self.base_item:
+            return 0
+        return self.base.item_len(key)
+
 
 class PosnBitArray:
 
-    def __init__(self, encoded_term_posns: Union[ArrayDict, FilteredPosns], max_doc_id: int):
+    def __init__(self, encoded_term_posns: Union[ArrayDict, FilteredPosns],
+                 max_doc_id: int,
+                 cache_at_len: int = 10):
         self.encoded_term_posns = encoded_term_posns
         self.max_doc_id = max_doc_id
+        self.cache_at_len = cache_at_len
         self.docfreq_cache : Dict[int, np.uint64] = {}
         self.termfreq_cache : Dict[int, Tuple[np.ndarray, np.ndarray]] = {}
 
@@ -470,6 +478,8 @@ def _termfreqs_with_cache(self, term_id: int) -> Tuple[np.ndarray, np.ndarray]:
             term_posns = self.encoded_term_posns[term_id]
             doc_ids, term_freqs = self._computed_term_freqs(term_posns)
             if self._is_cached(term_id):
+                doc_ids = doc_ids[term_freqs > 0]
+                term_freqs = term_freqs[term_freqs > 0]
                 self.termfreq_cache[term_id] = (doc_ids, term_freqs)
             return doc_ids, term_freqs
 
@@ -480,8 +490,9 @@ def _docfreq_from_cache(self, term_id: int) -> np.uint64:
         return self.docfreq_cache[term_id]
 
     def _maybe_cache_docfreq(self, term_id: int, docfreq: np.uint64):
-        if self.max_doc_id >= 99999 and docfreq > (self.max_doc_id // 100):
-            self.docfreq_cache[term_id] = docfreq
+        if self.encoded_term_posns:
+            if self.encoded_term_posns.item_len(term_id) > self.cache_at_len:
+                self.docfreq_cache[term_id] = docfreq
 
     def docfreq(self, term_id: int) -> np.uint64:
         try:

diff --git a/searcharray/postings.py b/searcharray/postings.py
@@ -432,6 +432,10 @@ def value_counts(
             counts = Counter(self[:])
         return pd.Series(counts)
 
+    def tf_cache_at(self, terms_len):
+        """Cache term frequencies at a given length."""
+        self.posns.cache_at_len = terms_len
+
     def __len__(self):
         len_rval = len(self.term_mat.rows)
         return len_rval

diff --git a/setup.py b/setup.py
@@ -37,7 +37,7 @@
     # For a discussion on single-sourcing the version across setup.py and the
     # project code, see
     # https://packaging.python.org/guides/single-sourcing-package-version/
-    version="0.0.64",  # Required
+    version="0.0.65",  # Required
     # This is a one-line description or tagline of what your project does. This
     # corresponds to the "Summary" metadata field:
     # https://packaging.python.org/specifications/core-metadata/#summary