Skip to content

Commit

Permalink
Duplicate spans to allow other combinations
Browse files — browse the repository at this point in the history
  • Loading branch information
softwaredoug committed Jun 27, 2024
1 parent 73af082 commit 098bee9
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 9 deletions.
31 changes: 27 additions & 4 deletions searcharray/roaringish/spans.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,9 @@ cdef _span_freqs(DTYPE_t[:] posns, # Flattened all terms in one array
term &= payload_mask
curr_term_mask = 0x1 << term_ord

print("***")
print(f"Term {term_ord} {curr_key} {term:b} {payload_base} slop:{slop} max_span_width:{max_span_width}")

# Consume every position into every possible span
while term != 0:
set_idx = _consume_lsb(&term)
Expand All @@ -201,9 +204,13 @@ cdef _span_freqs(DTYPE_t[:] posns, # Flattened all terms in one array
spans.posns[spans.cursor] = posn_mask
spans.beg[spans.cursor] = curr_posn
spans.end[spans.cursor] = curr_posn
print(f"ADDING AT {spans.cursor} {curr_posn}")
_print_span(&spans, spans.cursor)

# Update existing spans
print("UPDATING")
end = spans.cursor
spans.cursor += 1
for span_idx in range(end):
# Continue active spans
num_terms_visited = _num_terms(&spans, span_idx)
Expand All @@ -218,22 +225,37 @@ cdef _span_freqs(DTYPE_t[:] posns, # Flattened all terms in one array
spans.posns[span_idx] |= posn_mask

new_unique_posns = _num_posns(&spans, span_idx)
print(f"{span_idx}: set_idx + payload_base {set_idx + payload_base} spans.beg[span_idx] {spans.beg[span_idx]}")
proposed_width = abs(curr_posn - spans.beg[span_idx])
if (num_posns_visited == new_unique_posns) or proposed_width > max_span_width:
# Clear curr_term_mask and cancel this position, we've seen it before
print(f"{span_idx}: Canceling posn -- {proposed_width} {max_span_width} | num_posns_visited:{num_posns_visited} new_unique_posns:{new_unique_posns}")
_print_span(&spans, span_idx)
spans.terms[span_idx] &= ~curr_term_mask
continue

if spans.cursor < 128:
print(f"DUPLICATED at end!")
spans.terms[spans.cursor] = spans.terms[span_idx]
spans.posns[spans.cursor] = (spans.posns[span_idx] & ~posn_mask)
spans.beg[spans.cursor] = spans.beg[span_idx]
spans.end[spans.cursor] = spans.end[span_idx]
_print_span(&spans, spans.cursor)
spans.cursor += 1
else:
assert False, "FULL!"
print("FULL!")

print(f"{span_idx}: Before update {curr_posn}")
_print_span(&spans, span_idx)
spans.end[span_idx] = curr_posn
print(f"{span_idx}: Updated span w/ posn {curr_posn}")
_print_span(&spans, span_idx)
span_width = _span_width(&spans, span_idx)
if span_width > max_span_width:
continue
if spans.cursor >= 128:
break

if spans.cursor >= 128:
break
spans.cursor += 1
last_set_idx = set_idx
curr_idx[term_ord] += 1
if curr_idx[term_ord] < lengths[term_ord+1]:
Expand All @@ -255,6 +277,7 @@ cdef _span_freqs(DTYPE_t[:] posns, # Flattened all terms in one array
# All terms consumed for doc
collected_spans = _collect_spans(&spans, num_terms, max_span_width)
phrase_freqs[last_key] += collected_spans.cursor
print(f"Doc {last_key} {collected_spans.cursor}")

# Reset
spans = _new_active_spans()
Expand Down
6 changes: 1 addition & 5 deletions test/test_phrase_matches.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,11 +219,7 @@ def assert_higher_slop_matches(docs, phrase, matches):
missing = list(set(last_scores_idx) - set(scores_idx))
assert np.all(np.isin(scores_idx, last_scores_idx)), f"Slop {slop} not subset of {slop - 1} slop -- missing: {missing[0:10]}..."
where_lt = np.argwhere(scores < last_scores).flatten()
try:
assert len(where_lt) == 0, f"Expected {slop} >= {slop - 1} slop -- {scores[where_lt[0:10]]} < {last_scores[where_lt[0:10]]}"
except AssertionError as e:
print(f"Expected {slop} >= {slop - 1} slop -- {scores[where_lt[0:10]]} < {last_scores[where_lt[0:10]]}")
warnings.warn(str(e))
assert len(where_lt) == 0, f"Expected {slop} >= {slop - 1} slop -- {scores[where_lt[0:10]]} < {last_scores[where_lt[0:10]]}"


@w_scenarios(scenarios)
Expand Down

0 comments on commit 098bee9

Please sign in to comment.