Skip to content

Commit

Permalink
Add sorted adjacenct function
Browse files Browse the repository at this point in the history
  • Loading branch information
softwaredoug committed Mar 13, 2024
1 parent 3cb903c commit ba91a31
Show file tree
Hide file tree
Showing 3 changed files with 162 additions and 5 deletions.
9 changes: 5 additions & 4 deletions searcharray/utils/roaringish.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import numbers
from typing import Optional, Tuple, List, Union

from searcharray.utils.snp_ops import intersect, unique
from searcharray.utils.snp_ops import intersect, unique, adjacent

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -205,9 +205,10 @@ def intersect_rshift(self, lhs: np.ndarray, rhs: np.ndarray,
rhs : np.ndarray of uint64 (encoded) values
rshift : int how much to shift rhs by to the right
"""
rhs_int, rhs_shifted = self._rhs_shifted(rhs, rshift)
_, lhs_idx, rhs_idx = intersect(lhs, rhs_shifted, mask=self.header_mask)
return lhs[lhs_idx], rhs_int[rhs_idx]
# print(lhs, rhs, self.header_mask)
# import pdb; pdb.set_trace()
lhs_idx, rhs_idx = adjacent(lhs, rhs, mask=self.header_mask)
return lhs[lhs_idx], rhs[rhs_idx]

def intersect(self, lhs: np.ndarray, rhs: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
"""Return the MSBs that are common to both lhs and rhs (same keys, same MSBs)
Expand Down
94 changes: 94 additions & 0 deletions searcharray/utils/snp_ops.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ cdef _intersection(DTYPE_t[:] lhs,
cdef np.intp_t i_rhs = 0
cdef np.intp_t i_result = 0
cdef DTYPE_t value_prev = -1
cdef DTYPE_t value_lhs = 0
cdef DTYPE_t value_rhs = 0

# Outputs as numpy arrays
cdef np.uint64_t[:] results = np.empty(min(len_lhs, len_rhs), dtype=np.uint64)
Expand Down Expand Up @@ -194,6 +196,98 @@ def intersect(np.ndarray[DTYPE_t, ndim=1] lhs,
# return _u64(result), _u64(indices_lhs), _u64(indices_rhs)


cdef _adjacent(DTYPE_t[:] lhs,
DTYPE_t[:] rhs,
DTYPE_t mask=ALL_BITS,
DTYPE_t delta=1):
# Find all LHS / RHS indices where LHS is 1 before RHS
cdef np.intp_t len_lhs = lhs.shape[0]
cdef np.intp_t len_rhs = rhs.shape[0]
cdef np.intp_t i_lhs = 0
cdef np.intp_t i_rhs = 0
cdef np.intp_t i_result = 0
cdef DTYPE_t value_prev = -1
cdef DTYPE_t value_lhs = 0
cdef DTYPE_t value_rhs = 0

# Outputs as numpy arrays
cdef np.int64_t result_idx = 0
cdef np.uint64_t[:] lhs_indices = np.empty(min(len_lhs, len_rhs), dtype=np.uint64)
cdef np.uint64_t[:] rhs_indices = np.empty(min(len_lhs, len_rhs), dtype=np.uint64)

# Read rhs until > delta
# print(f"MASKED {mask} | {rhs[0]} | {i_rhs} - ", rhs[i_rhs] & mask)
while i_rhs < len_rhs and rhs[i_rhs] & mask == 0:
i_rhs += 1

while i_lhs < len_lhs and i_rhs < len_rhs:
# Use gallping search to find the first element in the right array
value_lhs = lhs[i_lhs] & mask
value_rhs = rhs[i_rhs] & mask

# print("=====================================")
# print(f"i_lhs: {i_lhs}, i_rhs: {i_rhs}")
# print(f"vals: {value_lhs:0x}, {value_rhs:0x} | {delta:0x}")

# Advance LHS to RHS
if value_lhs < value_rhs - delta:
# print(f"Advance lhs to rhs: {value_lhs} -> {value_rhs}-{delta}")
if i_lhs >= len_lhs - 1:
# print("EXIT (exhaust lhs)")
break
i_result = i_lhs
# lhs 0_ 2* 2 lhs / rhs are at _, now advance to *
# rhs 0 3_ 3
# Advance lhs to the
_galloping_search(lhs, value_rhs - delta, mask, &i_result, len_lhs)
value_lhs = lhs[i_result] & mask
# print(f"search - i_result: {i_result}, value_lhs: {value_lhs}")
i_lhs = i_result
# Advance RHS to LHS
elif value_rhs - delta < value_lhs:
if i_rhs >= len_rhs - 1:
# print("EXIT (exhaust rhs)")
break
# print(f"Advance rhs to lhs: {value_rhs} | {value_rhs-delta} -> {value_lhs} | {i_result} {len_rhs}")
i_result = i_rhs
# lhs 0 2_ 2 lhs / rhs are at _, now advance to *
# rhs 0_ 3* 3 so that rhs is one past lhs
_galloping_search(rhs, value_lhs + delta,
mask, &i_result, len_rhs)
value_rhs = rhs[i_result] & mask
# print(f"search - i_result: {i_result}, value_rhs: {value_rhs}")
i_rhs = i_result

if value_lhs == value_rhs - delta:
if value_prev != value_lhs:
# Not a dup so store it.
# print(f"Store: i_lhs:{i_lhs} | i_rhs:{i_rhs} | val_lhs:{lhs[i_lhs]} | val_rhs:{rhs[i_rhs]}")
lhs_indices[result_idx] = i_lhs
rhs_indices[result_idx] = i_rhs
result_idx += 1
value_prev = value_lhs
i_lhs += 1
i_rhs += 1

# Get view of each result and return
return np.asarray(lhs_indices), np.asarray(rhs_indices), result_idx


def adjacent(np.ndarray[DTYPE_t, ndim=1] lhs,
np.ndarray[DTYPE_t, ndim=1] rhs,
DTYPE_t mask=ALL_BITS):
if mask == 0:
raise ValueError("Mask cannot be zero")
if mask is None:
mask = ALL_BITS
delta = 1
else:
delta = mask & -mask # lest significant set bit on mask

indices_lhs, indices_rhs, result_idx = _adjacent(lhs, rhs, mask, delta)
return indices_lhs[:result_idx], indices_rhs[:result_idx]


cdef _scan_unique(DTYPE_t[:] arr,
DTYPE_t arr_len):
cdef DTYPE_t i = 0
Expand Down
64 changes: 63 additions & 1 deletion test/test_snp_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import numpy as np
import sortednp as snp
import pytest
from searcharray.utils.snp_ops import binary_search, galloping_search, intersect, unique
from searcharray.utils.snp_ops import binary_search, galloping_search, intersect, adjacent, unique
from test_utils import w_scenarios
from test_utils import Profiler, profile_enabled

Expand Down Expand Up @@ -222,3 +222,65 @@ def unique_many():

profiler = Profiler(benchmark)
profiler.run(unique_many)


adj_scenarios = {
"base": {
"lhs": u64([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
"rhs": u64([2, 4, 6, 8, 10]),
"mask": None,
"lhs_idx_expected": u64arr([0, 2, 4, 6, 8]),
"rhs_idx_expected": u64arr([0, 1, 2, 3, 4])
},
"start_0s": {
"lhs": u64([0, 0, 3]),
"rhs": u64([0, 0, 1, 8, 10]),
"mask": None,
"lhs_idx_expected": u64arr([0]),
"rhs_idx_expected": u64arr([2])
},
"start_0s_2": {
"lhs": u64([0, 2, 3]),
"rhs": u64([0, 1, 2, 8, 10]),
"mask": None,
"lhs_idx_expected": u64arr([0]),
"rhs_idx_expected": u64arr([1])
},
"base_masked": {
"lhs": u64([0x1F, 0x2F, 0x3F, 0x4F, 0x5F, 0x6F, 0x7F, 0x8F, 0x9F, 0xAF]),
"rhs": u64([0x2F, 0x4F, 0x6F, 0x8F, 0xAF]),
"mask": 0xF0,
"lhs_idx_expected": u64arr([0, 2, 4, 6, 8]),
"rhs_idx_expected": u64arr([0, 1, 2, 3, 4])
},
"rhs_0": {
"lhs": u64([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
"rhs": u64([0, 3]),
"mask": None,
"lhs_idx_expected": u64arr([1]),
"rhs_idx_expected": u64arr([1])
},
"many_adjacent": {
"lhs": u64([1, 1, 1, 1, 1, 2, 2, 2, 2, 2]),
"rhs": u64([2, 3]),
"mask": None,
"lhs_idx_expected": u64arr([0, 5]), # As we drop dups
"rhs_idx_expected": u64arr([0, 1])
},
"trouble_scen": {
"lhs": u64([1, 274877906945, 549755813889, 824633720833]),
"rhs": u64([6, 137438953474, 274877906950, 412316860418]),
"mask": 0xfffffffffffc0000,
"lhs_expected": u64arr([]),
"rhs_expected": u64arr([])
}
}


@w_scenarios(adj_scenarios)
def test_adjacent(lhs, rhs, mask, lhs_idx_expected, rhs_idx_expected):
if mask is None:
mask = np.uint64(0xFFFFFFFFFFFFFFFF)
lhs_idx, rhs_idx = adjacent(lhs, rhs, mask)
assert np.all(lhs_idx == lhs_idx_expected)
assert np.all(rhs_idx == rhs_idx_expected)

0 comments on commit ba91a31

Please sign in to comment.