Skip to content

Commit

Permalink
Add term freq using packed posns
Browse files Browse the repository at this point in the history
  • Loading branch information
softwaredoug committed Dec 19, 2023
1 parent ff69972 commit 257592a
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 2 deletions.
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ test: deps
python -m pytest --benchmark-skip test


# Run only the benchmark tests, stopping at the first failure (-x).
# Unlike `benchmark`, this passes no --benchmark-autosave / --benchmark-histogram
# flags and does not open the histogram afterwards.
benchmark_dry_run: deps
python -m pytest -x --benchmark-only


benchmark: deps
python -m pytest -x --benchmark-only --benchmark-autosave --benchmark-histogram=./.benchmarks/histogram
open ./.benchmarks/histogram.svg
Expand Down
14 changes: 14 additions & 0 deletions searcharray/phrase/middle_out.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
import logging
from time import perf_counter

from searcharray.utils.bitcount import bit_count64


logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -325,6 +327,18 @@ def positions(self, term_id: int, key) -> List:
decs = decs[0]
return decs

def termfreqs(self, term_id: int) -> "tuple[np.ndarray, np.ndarray]":
    """Count term freqs using unique positions.

    Each element of ``self.encoded_term_posns[term_id]`` carries a doc id in
    the bits selected by ``DOC_ID_MASK`` and position bits in the bits
    selected by ``POSN_LSB_MASK``. The term frequency of a document is the
    total number of position bits set across that document's consecutive
    elements.

    Parameters
    ----------
    term_id : int
        Key into ``self.encoded_term_posns``.

    Returns
    -------
    (doc_ids, term_freqs)
        Two arrays of equal length: the doc id of every run of consecutive
        elements sharing a doc id, and the summed bit count of that run.
        Both are empty when the term has no encoded positions.

    NOTE(review): assumes elements of one document are stored contiguously
    (presumably sorted by doc id) - confirm against the encoder.
    """
    # The return annotation is quoted so it needs no extra typing import.
    encoded = self.encoded_term_posns[term_id]
    doc_ids = (encoded & DOC_ID_MASK) >> (64 - DOC_ID_BITS)
    bit_counts = bit_count64(encoded & POSN_LSB_MASK)

    if len(doc_ids) == 0:
        # np.add.reduceat raises IndexError for index 0 on an empty array.
        return doc_ids, bit_counts

    # Start offset of every run of equal doc ids (offset 0 always starts one).
    run_starts = np.concatenate(([0], np.nonzero(np.diff(doc_ids))[0] + 1))
    term_freqs = np.add.reduceat(bit_counts, run_starts)

    # Take the id at each run start rather than np.unique(doc_ids): this is
    # aligned with the reduceat segments by construction and skips a sort.
    return doc_ids[run_starts], term_freqs

def insert(self, key, term_ids_to_posns, is_encoded=False):
new_posns = PosnBitArrayBuilder()
if is_encoded:
Expand Down
5 changes: 3 additions & 2 deletions searcharray/postings.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,8 +586,9 @@ def term_freq(self, token):

try:
term_id = self.term_dict.get_term_id(token)
matches = self.term_freqs.copy_col_at(term_id).todense().flatten()
matches = np.asarray(matches).flatten()
matches = np.zeros(len(self), dtype=int)
doc_ids, termfreqs = self.posns.termfreqs(term_id)
matches[doc_ids] = termfreqs
return matches
except TermMissingError:
return np.zeros(len(self), dtype=int)
Expand Down
30 changes: 30 additions & 0 deletions searcharray/utils/bitcount.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Naive popcount implementation until such time that's exposed in numpy (SOON!)."""
import numpy as np


# Bit masks consumed (via the s* aliases below) by bit_count64.
m1 = np.uint64(0x5555555555555555)  # every other bit: 0101...
m2 = np.uint64(0x3333333333333333)  # every other bit pair: 0011...
m3 = np.uint64(0x0F0F0F0F0F0F0F0F)  # low nibble of every byte
m4 = np.uint64(0x0101010101010101)  # lowest bit of every byte


# All 64 bits set. Spelled out as a literal: np.uint64(-1) depends on
# out-of-bounds Python integer conversion, which NumPy deprecated in 1.24
# and rejects with OverflowError in 2.0.
mask = np.uint64(0xFFFFFFFFFFFFFFFF)
# TODO - precompute type specific hashes
s55 = np.uint64(m1 & mask)  # Add more digits for 128bit support
s33 = np.uint64(m2 & mask)
s0F = np.uint64(m3 & mask)
s01 = np.uint64(m4 & mask)
num_bytes_64 = 8  # bytes in a 64-bit word; sets the final shift in bit_count64


def bit_count64(arr):
    """Count the number of bits set in each element in the array.

    Parameters
    ----------
    arr : array of np.uint64 (or a np.uint64 scalar)
        Values whose set bits are counted. The input is never modified;
        every step builds a new result.

    Returns
    -------
    Per-element count of set bits, in the same shape as ``arr``.
    """
    # The final multiply wraps around 64 bits on purpose. NumPy reports that
    # as an overflow warning for scalar operands, so silence it locally.
    with np.errstate(over="ignore"):
        # Count of set bits within each 2-bit field.
        pairs = arr - ((arr >> 1) & s55)
        # Sum neighbouring 2-bit counts into 4-bit fields.
        nibbles = (pairs & s33) + ((pairs >> 2) & s33)
        # Sum neighbouring nibbles so every byte holds its own bit count.
        per_byte = (nibbles + (nibbles >> 4)) & s0F
        # Multiplying by 0x0101... accumulates all byte counts into the top
        # byte; shifting by 8 * (num_bytes_64 - 1) bits extracts it.
        return (per_byte * s01) >> (8 * (num_bytes_64 - 1))

0 comments on commit 257592a

Please sign in to comment.