From 098bee98b5b879ab5e5903d77632ab1f72df37c5 Mon Sep 17 00:00:00 2001 From: Doug Turnbull Date: Wed, 26 Jun 2024 21:20:09 -0400 Subject: [PATCH] Duplicate spans to allow other combinations --- searcharray/roaringish/spans.pyx | 31 +++++++++++++++++++++++++++---- test/test_phrase_matches.py | 6 +----- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/searcharray/roaringish/spans.pyx b/searcharray/roaringish/spans.pyx index 726f86d..ffb28fe 100644 --- a/searcharray/roaringish/spans.pyx +++ b/searcharray/roaringish/spans.pyx @@ -191,6 +191,9 @@ cdef _span_freqs(DTYPE_t[:] posns, # Flattened all terms in one array term &= payload_mask curr_term_mask = 0x1 << term_ord + print("***") + print(f"Term {term_ord} {curr_key} {term:b} {payload_base} slop:{slop} max_span_width:{max_span_width}") + # Consume every position into every possible span while term != 0: set_idx = _consume_lsb(&term) @@ -201,9 +204,13 @@ cdef _span_freqs(DTYPE_t[:] posns, # Flattened all terms in one array spans.posns[spans.cursor] = posn_mask spans.beg[spans.cursor] = curr_posn spans.end[spans.cursor] = curr_posn + print(f"ADDING AT {spans.cursor} {curr_posn}") + _print_span(&spans, spans.cursor) # Update existing spans + print("UPDATING") end = spans.cursor + spans.cursor += 1 for span_idx in range(end): # Continue active spans num_terms_visited = _num_terms(&spans, span_idx) @@ -218,22 +225,37 @@ cdef _span_freqs(DTYPE_t[:] posns, # Flattened all terms in one array spans.posns[span_idx] |= posn_mask new_unique_posns = _num_posns(&spans, span_idx) + print(f"{span_idx}: set_idx + payload_base {set_idx + payload_base} spans.beg[span_idx] {spans.beg[span_idx]}") proposed_width = abs(curr_posn - spans.beg[span_idx]) if (num_posns_visited == new_unique_posns) or proposed_width > max_span_width: # Clear curr_term_mask and cancel this position, we've seen it before + print(f"{span_idx}: Canceling posn -- {proposed_width} {max_span_width} | num_posns_visited:{num_posns_visited} new_unique_posns:{new_unique_posns}") + _print_span(&spans, span_idx) spans.terms[span_idx] &= ~curr_term_mask continue - + if spans.cursor < 128: + print(f"DUPLICATED at end!") + spans.terms[spans.cursor] = spans.terms[span_idx] + spans.posns[spans.cursor] = (spans.posns[span_idx] & ~posn_mask) + spans.beg[spans.cursor] = spans.beg[span_idx] + spans.end[spans.cursor] = spans.end[span_idx] + _print_span(&spans, spans.cursor) + spans.cursor += 1 + else: + assert False, "FULL!" + print("FULL!") + + print(f"{span_idx}: Before update {curr_posn}") + _print_span(&spans, span_idx) spans.end[span_idx] = curr_posn + print(f"{span_idx}: Updated span w/ posn {curr_posn}") + _print_span(&spans, span_idx) span_width = _span_width(&spans, span_idx) if span_width > max_span_width: continue - if spans.cursor >= 128: - break if spans.cursor >= 128: break - spans.cursor += 1 last_set_idx = set_idx curr_idx[term_ord] += 1 if curr_idx[term_ord] < lengths[term_ord+1]: @@ -255,6 +277,7 @@ cdef _span_freqs(DTYPE_t[:] posns, # Flattened all terms in one array # All terms consumed for doc collected_spans = _collect_spans(&spans, num_terms, max_span_width) phrase_freqs[last_key] += collected_spans.cursor + print(f"Doc {last_key} {collected_spans.cursor}") # Reset spans = _new_active_spans() diff --git a/test/test_phrase_matches.py b/test/test_phrase_matches.py index d0b6aa5..7ee31ce 100644 --- a/test/test_phrase_matches.py +++ b/test/test_phrase_matches.py @@ -219,11 +219,7 @@ def assert_higher_slop_matches(docs, phrase, matches): missing = list(set(last_scores_idx) - set(scores_idx)) assert np.all(np.isin(scores_idx, last_scores_idx)), f"Slop {slop} not subset of {slop - 1} slop -- missing: {missing[0:10]}..." where_lt = np.argwhere(scores < last_scores).flatten() - try: - assert len(where_lt) == 0, f"Expected {slop} >= {slop - 1} slop -- {scores[where_lt[0:10]]} < {last_scores[where_lt[0:10]]}" - except AssertionError as e: - print(f"Expected {slop} >= {slop - 1} slop -- {scores[where_lt[0:10]]} < {last_scores[where_lt[0:10]]}") - warnings.warn(str(e)) + assert len(where_lt) == 0, f"Expected {slop} >= {slop - 1} slop -- {scores[where_lt[0:10]]} < {last_scores[where_lt[0:10]]}" @w_scenarios(scenarios)