diff --git a/searcharray/utils/roaringish.py b/searcharray/utils/roaringish.py index 1c4a4c9..c50d269 100644 --- a/searcharray/utils/roaringish.py +++ b/searcharray/utils/roaringish.py @@ -8,7 +8,7 @@ import numbers from typing import Optional, Tuple, List, Union -from searcharray.utils.snp_ops import intersect, unique +from searcharray.utils.snp_ops import intersect, unique, adjacent logger = logging.getLogger(__name__) @@ -205,9 +205,10 @@ def intersect_rshift(self, lhs: np.ndarray, rhs: np.ndarray, rhs : np.ndarray of uint64 (encoded) values rshift : int how much to shift rhs by to the right """ - rhs_int, rhs_shifted = self._rhs_shifted(rhs, rshift) - _, lhs_idx, rhs_idx = intersect(lhs, rhs_shifted, mask=self.header_mask) - return lhs[lhs_idx], rhs_int[rhs_idx] + # print(lhs, rhs, self.header_mask) + # import pdb; pdb.set_trace() + lhs_idx, rhs_idx = adjacent(lhs, rhs, mask=self.header_mask) + return lhs[lhs_idx], rhs[rhs_idx] def intersect(self, lhs: np.ndarray, rhs: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """Return the MSBs that are common to both lhs and rhs (same keys, same MSBs) diff --git a/searcharray/utils/snp_ops.pyx b/searcharray/utils/snp_ops.pyx index 6b93b3f..727c6be 100644 --- a/searcharray/utils/snp_ops.pyx +++ b/searcharray/utils/snp_ops.pyx @@ -117,6 +117,8 @@ cdef _intersection(DTYPE_t[:] lhs, cdef np.intp_t i_rhs = 0 cdef np.intp_t i_result = 0 cdef DTYPE_t value_prev = -1 + cdef DTYPE_t value_lhs = 0 + cdef DTYPE_t value_rhs = 0 # Outputs as numpy arrays cdef np.uint64_t[:] results = np.empty(min(len_lhs, len_rhs), dtype=np.uint64) @@ -194,6 +196,98 @@ def intersect(np.ndarray[DTYPE_t, ndim=1] lhs, # return _u64(result), _u64(indices_lhs), _u64(indices_rhs) +cdef _adjacent(DTYPE_t[:] lhs, + DTYPE_t[:] rhs, + DTYPE_t mask=ALL_BITS, + DTYPE_t delta=1): + # Find all LHS / RHS indices where LHS is 1 before RHS + cdef np.intp_t len_lhs = lhs.shape[0] + cdef np.intp_t len_rhs = rhs.shape[0] + cdef np.intp_t i_lhs = 0 + cdef np.intp_t i_rhs = 0 + cdef np.intp_t i_result = 0 + cdef DTYPE_t value_prev = -1 + cdef DTYPE_t value_lhs = 0 + cdef DTYPE_t value_rhs = 0 + + # Outputs as numpy arrays + cdef np.int64_t result_idx = 0 + cdef np.uint64_t[:] lhs_indices = np.empty(min(len_lhs, len_rhs), dtype=np.uint64) + cdef np.uint64_t[:] rhs_indices = np.empty(min(len_lhs, len_rhs), dtype=np.uint64) + + # Read rhs until > delta + # print(f"MASKED {mask} | {rhs[0]} | {i_rhs} - ", rhs[i_rhs] & mask) + while i_rhs < len_rhs and rhs[i_rhs] & mask == 0: + i_rhs += 1 + + while i_lhs < len_lhs and i_rhs < len_rhs: + # Use gallping search to find the first element in the right array + value_lhs = lhs[i_lhs] & mask + value_rhs = rhs[i_rhs] & mask + + # print("=====================================") + # print(f"i_lhs: {i_lhs}, i_rhs: {i_rhs}") + # print(f"vals: {value_lhs:0x}, {value_rhs:0x} | {delta:0x}") + + # Advance LHS to RHS + if value_lhs < value_rhs - delta: + # print(f"Advance lhs to rhs: {value_lhs} -> {value_rhs}-{delta}") + if i_lhs >= len_lhs - 1: + # print("EXIT (exhaust lhs)") + break + i_result = i_lhs + # lhs 0_ 2* 2 lhs / rhs are at _, now advance to * + # rhs 0 3_ 3 + # Advance lhs to the + _galloping_search(lhs, value_rhs - delta, mask, &i_result, len_lhs) + value_lhs = lhs[i_result] & mask + # print(f"search - i_result: {i_result}, value_lhs: {value_lhs}") + i_lhs = i_result + # Advance RHS to LHS + elif value_rhs - delta < value_lhs: + if i_rhs >= len_rhs - 1: + # print("EXIT (exhaust rhs)") + break + # print(f"Advance rhs to lhs: {value_rhs} | {value_rhs-delta} -> {value_lhs} | {i_result} {len_rhs}") + i_result = i_rhs + # lhs 0 2_ 2 lhs / rhs are at _, now advance to * + # rhs 0_ 3* 3 so that rhs is one past lhs + _galloping_search(rhs, value_lhs + delta, + mask, &i_result, len_rhs) + value_rhs = rhs[i_result] & mask + # print(f"search - i_result: {i_result}, value_rhs: {value_rhs}") + i_rhs = i_result + + if value_lhs == value_rhs - delta: + if value_prev != value_lhs: + # Not a dup so store it. + # print(f"Store: i_lhs:{i_lhs} | i_rhs:{i_rhs} | val_lhs:{lhs[i_lhs]} | val_rhs:{rhs[i_rhs]}") + lhs_indices[result_idx] = i_lhs + rhs_indices[result_idx] = i_rhs + result_idx += 1 + value_prev = value_lhs + i_lhs += 1 + i_rhs += 1 + + # Get view of each result and return + return np.asarray(lhs_indices), np.asarray(rhs_indices), result_idx + + +def adjacent(np.ndarray[DTYPE_t, ndim=1] lhs, + np.ndarray[DTYPE_t, ndim=1] rhs, + DTYPE_t mask=ALL_BITS): + if mask == 0: + raise ValueError("Mask cannot be zero") + if mask is None: + mask = ALL_BITS + delta = 1 + else: + delta = mask & -mask # lest significant set bit on mask + + indices_lhs, indices_rhs, result_idx = _adjacent(lhs, rhs, mask, delta) + return indices_lhs[:result_idx], indices_rhs[:result_idx] + + cdef _scan_unique(DTYPE_t[:] arr, DTYPE_t arr_len): cdef DTYPE_t i = 0 diff --git a/test/test_snp_ops.py b/test/test_snp_ops.py index 64ca4bb..11008d6 100644 --- a/test/test_snp_ops.py +++ b/test/test_snp_ops.py @@ -2,7 +2,7 @@ import numpy as np import sortednp as snp import pytest -from searcharray.utils.snp_ops import binary_search, galloping_search, intersect, unique +from searcharray.utils.snp_ops import binary_search, galloping_search, intersect, adjacent, unique from test_utils import w_scenarios from test_utils import Profiler, profile_enabled @@ -222,3 +222,65 @@ def unique_many(): profiler = Profiler(benchmark) profiler.run(unique_many) + + +adj_scenarios = { + "base": { + "lhs": u64([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), + "rhs": u64([2, 4, 6, 8, 10]), + "mask": None, + "lhs_idx_expected": u64arr([0, 2, 4, 6, 8]), + "rhs_idx_expected": u64arr([0, 1, 2, 3, 4]) + }, + "start_0s": { + "lhs": u64([0, 0, 3]), + "rhs": u64([0, 0, 1, 8, 10]), + "mask": None, + "lhs_idx_expected": u64arr([0]), + "rhs_idx_expected": u64arr([2]) + }, + "start_0s_2": { + "lhs": u64([0, 2, 3]), + "rhs": u64([0, 1, 2, 8, 10]), + "mask": None, + "lhs_idx_expected": u64arr([0]), + "rhs_idx_expected": u64arr([1]) + }, + "base_masked": { + "lhs": u64([0x1F, 0x2F, 0x3F, 0x4F, 0x5F, 0x6F, 0x7F, 0x8F, 0x9F, 0xAF]), + "rhs": u64([0x2F, 0x4F, 0x6F, 0x8F, 0xAF]), + "mask": 0xF0, + "lhs_idx_expected": u64arr([0, 2, 4, 6, 8]), + "rhs_idx_expected": u64arr([0, 1, 2, 3, 4]) + }, + "rhs_0": { + "lhs": u64([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), + "rhs": u64([0, 3]), + "mask": None, + "lhs_idx_expected": u64arr([1]), + "rhs_idx_expected": u64arr([1]) + }, + "many_adjacent": { + "lhs": u64([1, 1, 1, 1, 1, 2, 2, 2, 2, 2]), + "rhs": u64([2, 3]), + "mask": None, + "lhs_idx_expected": u64arr([0, 5]), # As we drop dups + "rhs_idx_expected": u64arr([0, 1]) + }, + "trouble_scen": { + "lhs": u64([1, 274877906945, 549755813889, 824633720833]), + "rhs": u64([6, 137438953474, 274877906950, 412316860418]), + "mask": 0xfffffffffffc0000, + "lhs_expected": u64arr([]), + "rhs_expected": u64arr([]) + } +} + + +@w_scenarios(adj_scenarios) +def test_adjacent(lhs, rhs, mask, lhs_idx_expected, rhs_idx_expected): + if mask is None: + mask = np.uint64(0xFFFFFFFFFFFFFFFF) + lhs_idx, rhs_idx = adjacent(lhs, rhs, mask) + assert np.all(lhs_idx == lhs_idx_expected) + assert np.all(rhs_idx == rhs_idx_expected)