From 098bee98b5b879ab5e5903d77632ab1f72df37c5 Mon Sep 17 00:00:00 2001
From: Doug Turnbull <softwaredoug@gmail.com>
Date: Wed, 26 Jun 2024 21:20:09 -0400
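Subject: [PATCH] Duplicate spans to allow other combinations

When a position extends an active span, a copy of that span without the
new position is appended to the end of the span buffer before the
original is extended, so the copy remains available for other
combinations. The cursor is now advanced as soon as the new
single-position span is added, and hitting the 128-span limit while
duplicating trips an assertion instead of breaking out of the update
loop.

Also adds debug print statements to _span_freqs, and makes the slop
check in assert_higher_slop_matches a hard assertion rather than a
warning.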
---
 searcharray/roaringish/spans.pyx | 31 +++++++++++++++++++++++++++----
 test/test_phrase_matches.py      |  6 +-----
 2 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/searcharray/roaringish/spans.pyx b/searcharray/roaringish/spans.pyx
index 726f86d..ffb28fe 100644
--- a/searcharray/roaringish/spans.pyx
+++ b/searcharray/roaringish/spans.pyx
@@ -191,6 +191,9 @@ cdef _span_freqs(DTYPE_t[:] posns,      # Flattened all terms in one array
                 term &= payload_mask
                 curr_term_mask = 0x1 << term_ord
 
+                print("***")
+                print(f"Term {term_ord} {curr_key} {term:b} {payload_base} slop:{slop} max_span_width:{max_span_width}")
+
                 # Consume every position into every possible span
                 while term != 0:
                     set_idx = _consume_lsb(&term)
@@ -201,9 +204,13 @@ cdef _span_freqs(DTYPE_t[:] posns,      # Flattened all terms in one array
                     spans.posns[spans.cursor] = posn_mask
                     spans.beg[spans.cursor] = curr_posn
                     spans.end[spans.cursor] = curr_posn
+                    print(f"ADDING AT {spans.cursor} {curr_posn}")
+                    _print_span(&spans, spans.cursor)
 
                     # Update existing spans
+                    print("UPDATING")
                     end = spans.cursor
+                    spans.cursor += 1
                     for span_idx in range(end):
                         # Continue active spans
                         num_terms_visited = _num_terms(&spans, span_idx)
@@ -218,22 +225,37 @@ cdef _span_freqs(DTYPE_t[:] posns,      # Flattened all terms in one array
                             spans.posns[span_idx] |= posn_mask
 
                             new_unique_posns = _num_posns(&spans, span_idx)
+                            print(f"{span_idx}: set_idx + payload_base {set_idx + payload_base} spans.beg[span_idx] {spans.beg[span_idx]}")
                             proposed_width = abs(curr_posn - spans.beg[span_idx])
                             if (num_posns_visited == new_unique_posns) or proposed_width > max_span_width:
                                 # Clear curr_term_mask and cancel this position, we've seen it before
+                                print(f"{span_idx}: Canceling posn -- {proposed_width} {max_span_width} | num_posns_visited:{num_posns_visited} new_unique_posns:{new_unique_posns}")
+                                _print_span(&spans, span_idx)
                                 spans.terms[span_idx] &= ~curr_term_mask
                                 continue
-
+                            if spans.cursor < 128:
+                                print(f"DUPLICATED at end!")
+                                spans.terms[spans.cursor] = spans.terms[span_idx]
+                                spans.posns[spans.cursor] = (spans.posns[span_idx] & ~posn_mask)
+                                spans.beg[spans.cursor] = spans.beg[span_idx]
+                                spans.end[spans.cursor] = spans.end[span_idx]
+                                _print_span(&spans, spans.cursor)
+                                spans.cursor += 1
+                            else:
+                                assert False, "FULL!"
+                                print("FULL!")
+
+                            print(f"{span_idx}:  Before update {curr_posn}")
+                            _print_span(&spans, span_idx)
                             spans.end[span_idx] = curr_posn
+                            print(f"{span_idx}: Updated span w/ posn {curr_posn}")
+                            _print_span(&spans, span_idx)
                             span_width = _span_width(&spans, span_idx)
                             if span_width > max_span_width:
                                 continue
-                            if spans.cursor >= 128:
-                                break
 
                     if spans.cursor >= 128:
                         break
-                    spans.cursor += 1
                     last_set_idx = set_idx
                 curr_idx[term_ord] += 1
                 if curr_idx[term_ord] < lengths[term_ord+1]:
@@ -255,6 +277,7 @@ cdef _span_freqs(DTYPE_t[:] posns,      # Flattened all terms in one array
         # All terms consumed for doc
         collected_spans = _collect_spans(&spans, num_terms, max_span_width)
         phrase_freqs[last_key] += collected_spans.cursor
+        print(f"Doc {last_key} {collected_spans.cursor}")
 
         # Reset
         spans = _new_active_spans()
diff --git a/test/test_phrase_matches.py b/test/test_phrase_matches.py
index d0b6aa5..7ee31ce 100644
--- a/test/test_phrase_matches.py
+++ b/test/test_phrase_matches.py
@@ -219,11 +219,7 @@ def assert_higher_slop_matches(docs, phrase, matches):
             missing = list(set(last_scores_idx) - set(scores_idx))
             assert np.all(np.isin(scores_idx, last_scores_idx)), f"Slop {slop} not subset of {slop - 1} slop -- missing: {missing[0:10]}..."
             where_lt = np.argwhere(scores < last_scores).flatten()
-            try:
-                assert len(where_lt) == 0, f"Expected {slop} >= {slop - 1} slop -- {scores[where_lt[0:10]]} < {last_scores[where_lt[0:10]]}"
-            except AssertionError as e:
-                print(f"Expected {slop} >= {slop - 1} slop -- {scores[where_lt[0:10]]} < {last_scores[where_lt[0:10]]}")
-                warnings.warn(str(e))
+            assert len(where_lt) == 0, f"Expected {slop} >= {slop - 1} slop -- {scores[where_lt[0:10]]} < {last_scores[where_lt[0:10]]}"
 
 
 @w_scenarios(scenarios)