Skip to content

Commit

Permalink
Add faster unique keys
Browse files Browse the repository at this point in the history
  • Loading branch information
softwaredoug committed Mar 13, 2024
1 parent e51f8d5 commit 3cb903c
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 12 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ __pycache__/
*.so
# Generated by Cython
*.c
*.html

# Benchmarks
.benchmarks
Expand Down
5 changes: 2 additions & 3 deletions searcharray/utils/roaringish.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,9 +161,8 @@ def keys(self, encoded: np.ndarray) -> np.ndarray:

def keys_unique(self, encoded: np.ndarray) -> np.ndarray:
"""Return keys from encoded."""
keys = self.keys(encoded)
intersected = sorted_unique(keys)
return intersected
rshift = _64 - self.key_bits
return unique(encoded, rshift)

def payload_msb(self, encoded: np.ndarray) -> np.ndarray:
"""Return payload MSBs from encoded."""
Expand Down
21 changes: 12 additions & 9 deletions searcharray/utils/snp_ops.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -213,21 +213,21 @@ cdef _scan_unique(DTYPE_t[:] arr,
return result, result_idx


cdef _scan_unique_masked(DTYPE_t[:] arr,
DTYPE_t arr_len,
DTYPE_t mask):
cdef _scan_unique_shifted(DTYPE_t[:] arr,
DTYPE_t arr_len,
DTYPE_t rshift):
cdef DTYPE_t i = 0

cdef np.uint64_t[:] result = np.empty(arr_len, dtype=np.uint64)
cdef DTYPE_t result_idx = 0
cdef DTYPE_t target = arr[i]
cdef DTYPE_t target = arr[i] >> rshift

while i < arr_len:
target = arr[i]
target = arr[i] >> rshift
result[result_idx] = target
result_idx += 1
i += 1
while i < arr_len and (arr[i] & mask) == (target & mask):
while i < arr_len and (arr[i] >> rshift) == target:
i += 1

return result, result_idx
Expand All @@ -236,6 +236,9 @@ cdef _scan_unique_masked(DTYPE_t[:] arr,


def unique(np.ndarray[DTYPE_t, ndim=1] arr,
DTYPE_t mask=ALL_BITS):
result, result_idx = _scan_unique(arr, arr.shape[0])
return result[:result_idx]
DTYPE_t rshift=0):
if rshift > 0:
result, result_idx = _scan_unique_shifted(arr, arr.shape[0], rshift)
else:
result, result_idx = _scan_unique(arr, arr.shape[0])
return np.array(result[:result_idx])
7 changes: 7 additions & 0 deletions test/test_snp_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,13 @@ def test_unique(array):
assert np.all(result == expected)


@pytest.mark.parametrize(
    "array,shift",
    [
        (u64([0xEE00, 0xFF00, 0xFF01]), 8),
        # Every element collapses onto a single shifted key.
        (u64([0xFF00, 0xFF01, 0xFF02]), 8),
    ],
)
def test_unique_shifted(array, shift):
    """Check ``unique(array, shift)`` against ``np.unique(array >> shift)``.

    Each input is listed in ascending order, so the order of the shifted
    keys matches the sorted output of ``np.unique``.
    """
    expected = np.unique(array >> shift)
    result = unique(array, shift)
    # array_equal compares shapes as well as values.  The elementwise form
    # ``np.all(result == expected)`` can broadcast (e.g. an empty result
    # against a one-element expectation) and then pass vacuously.
    assert np.array_equal(result, expected)


@pytest.mark.parametrize("seed", [0, 1, 2, 3, 4])
def test_unique_matches_snp(seed):
np.random.seed(seed)
Expand Down

0 comments on commit 3cb903c

Please sign in to comment.