Skip to content

Commit

Permalink
Termfreq miscounted when looping and gathering popcount
Browse files Browse the repository at this point in the history
  • Loading branch information
softwaredoug committed Jul 30, 2024
1 parent 5612373 commit fe84dc5
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 7 deletions.
18 changes: 12 additions & 6 deletions searcharray/roaringish/popcount.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -130,26 +130,32 @@ cdef _popcount64_reduce(DTYPE_t[:] arr,
DTYPE_t value_mask):
cdef float[:] popcounts = np.zeros(arr.shape[0], dtype=np.float32)
cdef DTYPE_t[:] keys = np.empty(arr.shape[0], dtype=np.uint64)
# cdef int i = 0
cdef float* popcounts_ptr = &popcounts[0]
cdef DTYPE_t* keys_ptr = &keys[0]
cdef DTYPE_t* arr_ptr = &arr[0]
cdef DTYPE_t last_key = 0xFFFFFFFFFFFFFFFF

cdef DTYPE_t last_key = arr_ptr[0] >> key_shift
keys_ptr[0] = last_key

for _ in range(arr.shape[0]):
popcounts_ptr[0] += __builtin_popcountll(arr_ptr[0] & value_mask)
if arr_ptr[0] >> key_shift != last_key:
if (arr_ptr[0] >> key_shift) == last_key:
popcounts_ptr[0] += __builtin_popcountll(arr_ptr[0] & value_mask)
else:
last_key = arr_ptr[0] >> key_shift
keys_ptr[0] = last_key
popcounts_ptr += 1
keys_ptr += 1
# Init next key
keys_ptr[0] = last_key
popcounts_ptr[0] = __builtin_popcountll(arr_ptr[0] & value_mask)
arr_ptr += 1
return keys, popcounts, keys_ptr - &keys[0]
return keys, popcounts, (keys_ptr - &keys[0] + 1)


def popcount64_reduce(arr,
                      key_shift,
                      value_mask):
    """Reduce an array of packed 64-bit words to per-key popcounts.

    Each word is split into a key (``word >> key_shift``) and a value
    (``word & value_mask``); the heavy lifting is delegated to
    ``_popcount64_reduce``, which accumulates the popcount of the value
    bits for runs of consecutive words that share a key.

    Parameters
    ----------
    arr : array of uint64
        Packed words to reduce (must be assignable to a ``DTYPE_t[:]`` view).
    key_shift : integer
        Right shift applied to a word to obtain its key.
    value_mask : integer
        Mask applied to a word to select the bits that are counted.

    Returns
    -------
    (keys, popcounts) : tuple of np.ndarray
        ``keys`` is uint64 and ``popcounts`` is float32, both trimmed to the
        number of results reported by ``_popcount64_reduce``.
    """
    cdef DTYPE_t[:] arr_view = arr
    if len(arr_view) == 0:
        # The helper reads arr[0] unconditionally, so empty input is handled
        # here. Use the same dtypes as the non-empty path (uint64 keys,
        # float32 counts) so callers never see float64 results for empty input.
        return np.array([], dtype=np.uint64), np.array([], dtype=np.float32)
    keys, popcounts, results_idx = _popcount64_reduce(arr_view, key_shift, value_mask)
    # Only the first results_idx slots of the preallocated buffers are valid.
    return np.array(keys[:results_idx]), np.array(popcounts[:results_idx])
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
# For a discussion on single-sourcing the version across setup.py and the
# project code, see
# https://packaging.python.org/guides/single-sourcing-package-version/
version="0.0.66", # Required
version="0.0.67", # Required
# This is a one-line description or tagline of what your project does. This
# corresponds to the "Summary" metadata field:
# https://packaging.python.org/specifications/core-metadata/#summary
Expand Down
21 changes: 21 additions & 0 deletions test/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,27 @@
DATA_DIR = '/tmp/tmdb'


def ws_lowercase(text):
    """Tokenize ``text`` by lowercasing it and splitting on whitespace."""
    lowered = text.lower()
    tokens = lowered.split()
    return tokens


# Scenarios for test_term_freq: each entry supplies the indexed array ("arr"),
# the term to look up ("term"), and the expected per-document term
# frequencies ("expected").
tf_scenarios = {
"base": {
# Three documents, repeated 25 times, tokenized with ws_lowercase
# (lowercase + whitespace split).
"arr": SearchArray.index(["""bradford bradford""",
"""bradford""",
""""William Bradford (Mayflower passenger) William Bradford (1590 – 1657) was a passenger on the Mayflower in 1620. He travelled to the New World to live in religious freedom. He became the second Governor of Plymouth Colony and served for over 30 years. Bradford kept a journal of the history of the early life in Plymouth Colony. It is called Of Plymouth Plantation."""] * 25, tokenizer=ws_lowercase),
"term": "bradford",
# "bradford" occurs 2, 1 and 3 times in the three documents above; the
# pattern repeats once per copy of the document triple.
"expected": [2, 1, 3] * 25,
}
}


@w_scenarios(tf_scenarios)
def test_term_freq(arr, term, expected):
    """Check that the term frequencies of ``term`` in ``arr`` equal ``expected``."""
    observed = arr.termfreqs(term)
    matches = observed == expected
    assert np.all(matches)


@pytest.fixture
def data():
"""Return a fixture of your data here that returns an instance of your ExtensionArray."""
Expand Down

0 comments on commit fe84dc5

Please sign in to comment.