Add sorted adjacenct function

softwaredoug · Mar 13, 2024 · ba91a31 · ba91a31
1 parent 3cb903c
commit ba91a31
Show file tree

Hide file tree

Showing 3 changed files with 162 additions and 5 deletions.
diff --git a/searcharray/utils/roaringish.py b/searcharray/utils/roaringish.py
@@ -8,7 +8,7 @@
 import numbers
 from typing import Optional, Tuple, List, Union
 
-from searcharray.utils.snp_ops import intersect, unique
+from searcharray.utils.snp_ops import intersect, unique, adjacent
 
 logger = logging.getLogger(__name__)
 
@@ -205,9 +205,10 @@ def intersect_rshift(self, lhs: np.ndarray, rhs: np.ndarray,
         rhs : np.ndarray of uint64 (encoded) values
         rshift : int how much to shift rhs by to the right
         """
-        rhs_int, rhs_shifted = self._rhs_shifted(rhs, rshift)
-        _, lhs_idx, rhs_idx = intersect(lhs, rhs_shifted, mask=self.header_mask)
-        return lhs[lhs_idx], rhs_int[rhs_idx]
+        # print(lhs, rhs, self.header_mask)
+        # import pdb; pdb.set_trace()
+        lhs_idx, rhs_idx = adjacent(lhs, rhs, mask=self.header_mask)
+        return lhs[lhs_idx], rhs[rhs_idx]
 
     def intersect(self, lhs: np.ndarray, rhs: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
         """Return the MSBs that are common to both lhs and rhs (same keys, same MSBs)

diff --git a/searcharray/utils/snp_ops.pyx b/searcharray/utils/snp_ops.pyx
@@ -117,6 +117,8 @@ cdef _intersection(DTYPE_t[:] lhs,
     cdef np.intp_t i_rhs = 0
     cdef np.intp_t i_result = 0
     cdef DTYPE_t value_prev = -1
+    cdef DTYPE_t value_lhs = 0
+    cdef DTYPE_t value_rhs = 0
 
     # Outputs as numpy arrays
     cdef np.uint64_t[:] results = np.empty(min(len_lhs, len_rhs), dtype=np.uint64)
@@ -194,6 +196,98 @@ def intersect(np.ndarray[DTYPE_t, ndim=1] lhs,
     # return _u64(result), _u64(indices_lhs), _u64(indices_rhs)
 
 
+cdef _adjacent(DTYPE_t[:] lhs,
+               DTYPE_t[:] rhs,
+               DTYPE_t mask=ALL_BITS,
+               DTYPE_t delta=1):
+    # Find all LHS / RHS indices where LHS is 1 before RHS
+    cdef np.intp_t len_lhs = lhs.shape[0]
+    cdef np.intp_t len_rhs = rhs.shape[0]
+    cdef np.intp_t i_lhs = 0
+    cdef np.intp_t i_rhs = 0
+    cdef np.intp_t i_result = 0
+    cdef DTYPE_t value_prev = -1
+    cdef DTYPE_t value_lhs = 0
+    cdef DTYPE_t value_rhs = 0
+
+    # Outputs as numpy arrays
+    cdef np.int64_t result_idx = 0
+    cdef np.uint64_t[:] lhs_indices = np.empty(min(len_lhs, len_rhs), dtype=np.uint64)
+    cdef np.uint64_t[:] rhs_indices = np.empty(min(len_lhs, len_rhs), dtype=np.uint64)
+
+    # Read rhs until > delta
+    # print(f"MASKED {mask} | {rhs[0]} | {i_rhs} - ", rhs[i_rhs] & mask)
+    while i_rhs < len_rhs and rhs[i_rhs] & mask == 0:
+        i_rhs += 1
+
+    while i_lhs < len_lhs and i_rhs < len_rhs:
+        # Use gallping search to find the first element in the right array
+        value_lhs = lhs[i_lhs] & mask
+        value_rhs = rhs[i_rhs] & mask
+
+        # print("=====================================")
+        # print(f"i_lhs: {i_lhs}, i_rhs: {i_rhs}")
+        # print(f"vals: {value_lhs:0x}, {value_rhs:0x} | {delta:0x}")
+
+        # Advance LHS to RHS
+        if value_lhs < value_rhs - delta:
+            # print(f"Advance lhs to rhs: {value_lhs} -> {value_rhs}-{delta}")
+            if i_lhs >= len_lhs - 1:
+                # print("EXIT (exhaust lhs)")
+                break
+            i_result = i_lhs
+            # lhs   0_  2*  2   lhs / rhs are at _, now advance to *
+            # rhs   0   3_  3
+            # Advance lhs to the 
+            _galloping_search(lhs, value_rhs - delta, mask, &i_result, len_lhs)
+            value_lhs = lhs[i_result] & mask
+            # print(f"search - i_result: {i_result}, value_lhs: {value_lhs}")
+            i_lhs = i_result
+        # Advance RHS to LHS
+        elif value_rhs - delta < value_lhs:
+            if i_rhs >= len_rhs - 1:
+                # print("EXIT (exhaust rhs)")
+                break
+            # print(f"Advance rhs to lhs: {value_rhs} | {value_rhs-delta} -> {value_lhs} | {i_result} {len_rhs}")
+            i_result = i_rhs
+            # lhs   0    2_   2   lhs / rhs are at _, now advance to *
+            # rhs   0_   3*   3    so that rhs is one past lhs
+            _galloping_search(rhs, value_lhs + delta,
+                              mask, &i_result, len_rhs)
+            value_rhs = rhs[i_result] & mask
+            # print(f"search - i_result: {i_result}, value_rhs: {value_rhs}")
+            i_rhs = i_result
+
+        if value_lhs == value_rhs - delta:
+            if value_prev != value_lhs:
+                # Not a dup so store it.
+                # print(f"Store: i_lhs:{i_lhs} | i_rhs:{i_rhs} | val_lhs:{lhs[i_lhs]} | val_rhs:{rhs[i_rhs]}")
+                lhs_indices[result_idx] = i_lhs
+                rhs_indices[result_idx] = i_rhs
+                result_idx += 1
+            value_prev = value_lhs
+            i_lhs += 1
+            i_rhs += 1
+
+    # Get view of each result and return
+    return np.asarray(lhs_indices), np.asarray(rhs_indices), result_idx
+
+
+def adjacent(np.ndarray[DTYPE_t, ndim=1] lhs,
+             np.ndarray[DTYPE_t, ndim=1] rhs,
+             DTYPE_t mask=ALL_BITS):
+    if mask == 0:
+        raise ValueError("Mask cannot be zero")
+    if mask is None:
+        mask = ALL_BITS
+        delta = 1
+    else:
+        delta = mask & -mask  # lest significant set bit on mask
+
+    indices_lhs, indices_rhs, result_idx = _adjacent(lhs, rhs, mask, delta)
+    return indices_lhs[:result_idx], indices_rhs[:result_idx]
+
+
 cdef _scan_unique(DTYPE_t[:] arr,
                   DTYPE_t arr_len):
     cdef DTYPE_t i = 0

diff --git a/test/test_snp_ops.py b/test/test_snp_ops.py
@@ -2,7 +2,7 @@
 import numpy as np
 import sortednp as snp
 import pytest
-from searcharray.utils.snp_ops import binary_search, galloping_search, intersect, unique
+from searcharray.utils.snp_ops import binary_search, galloping_search, intersect, adjacent, unique
 from test_utils import w_scenarios
 from test_utils import Profiler, profile_enabled
 
@@ -222,3 +222,65 @@ def unique_many():
 
     profiler = Profiler(benchmark)
     profiler.run(unique_many)
+
+
+adj_scenarios = {
+    "base": {
+        "lhs": u64([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
+        "rhs": u64([2, 4, 6, 8, 10]),
+        "mask": None,
+        "lhs_idx_expected": u64arr([0, 2, 4, 6, 8]),
+        "rhs_idx_expected": u64arr([0, 1, 2, 3, 4])
+    },
+    "start_0s": {
+        "lhs": u64([0, 0, 3]),
+        "rhs": u64([0, 0, 1, 8, 10]),
+        "mask": None,
+        "lhs_idx_expected": u64arr([0]),
+        "rhs_idx_expected": u64arr([2])
+    },
+    "start_0s_2": {
+        "lhs": u64([0, 2, 3]),
+        "rhs": u64([0, 1, 2, 8, 10]),
+        "mask": None,
+        "lhs_idx_expected": u64arr([0]),
+        "rhs_idx_expected": u64arr([1])
+    },
+    "base_masked": {
+        "lhs": u64([0x1F, 0x2F, 0x3F, 0x4F, 0x5F, 0x6F, 0x7F, 0x8F, 0x9F, 0xAF]),
+        "rhs": u64([0x2F, 0x4F, 0x6F, 0x8F, 0xAF]),
+        "mask": 0xF0,
+        "lhs_idx_expected": u64arr([0, 2, 4, 6, 8]),
+        "rhs_idx_expected": u64arr([0, 1, 2, 3, 4])
+    },
+    "rhs_0": {
+        "lhs": u64([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
+        "rhs": u64([0, 3]),
+        "mask": None,
+        "lhs_idx_expected": u64arr([1]),
+        "rhs_idx_expected": u64arr([1])
+    },
+    "many_adjacent": {
+        "lhs": u64([1, 1, 1, 1, 1, 2, 2, 2, 2, 2]),
+        "rhs": u64([2, 3]),
+        "mask": None,
+        "lhs_idx_expected": u64arr([0, 5]),  # As we drop dups
+        "rhs_idx_expected": u64arr([0, 1])
+    },
+    "trouble_scen": {
+        "lhs": u64([1, 274877906945, 549755813889, 824633720833]),
+        "rhs": u64([6, 137438953474, 274877906950, 412316860418]),
+        "mask": 0xfffffffffffc0000,
+        "lhs_expected": u64arr([]),
+        "rhs_expected": u64arr([])
+    }
+}
+
+
+@w_scenarios(adj_scenarios)
+def test_adjacent(lhs, rhs, mask, lhs_idx_expected, rhs_idx_expected):
+    if mask is None:
+        mask = np.uint64(0xFFFFFFFFFFFFFFFF)
+    lhs_idx, rhs_idx = adjacent(lhs, rhs, mask)
+    assert np.all(lhs_idx == lhs_idx_expected)
+    assert np.all(rhs_idx == rhs_idx_expected)