diff --git a/searcharray/phrase/scan_merge.py b/searcharray/phrase/scan_merge.py
index e45b805..aff04a6 100644
--- a/searcharray/phrase/scan_merge.py
+++ b/searcharray/phrase/scan_merge.py
@@ -2,234 +2,6 @@ from typing import List
 
 
-def advance_to_scan(arr, target, next_start, idx):
-    while idx < next_start and not arr[idx] > target:
-        idx += 1
-    return idx
-
-
-def advance_after_binsearch(arr, target, next_start, idx):
-    """Scan arr from idx until just after target. Return next_start if exhausted.
-
-    Will manually create a skip index if next_start - idx is large
-
-    arr from idx -> next_start should be sorted
-
-    """
-    beg = idx
-    end = next_start - 1
-    space = end - beg
-    if space < 1000:
-        advance_to_scan(arr, target, next_start, idx)
-
-    # print(f"Binsearching {target} in arr[{beg}:{end}] = {arr[beg:end]}")
-    if arr[idx] > target:
-        # print("Already past, returning idx")
-        return idx
-
-    mid = None
-    while True:
-        space = end - beg
-        if space <= 1000:
-            return advance_to_scan(arr, target, next_start, beg)
-
-        mid = beg + (space // 2)
-        if arr[mid] == target:
-            # print(f"Found exact match, returning {mid + 1}")
-            return mid + 1
-        elif arr[mid] > target and arr[mid - 1] <= target:
-            # print(f"Found one past, returning {mid}")
-            return mid
-        elif arr[mid] < target:
-            # print(f"Moving beg to {mid}")
-            # print(f"  end is {end}")
-            # print(f"arr[{mid}] is {arr[mid]}")
-            beg = mid
-        else:  # elif arr[mid] > target:
-            # print(f"  beg is {beg}")
-            # print(f"Moving end to {mid}")
-            # print(f"arr[{mid}] is {arr[mid]}")
-            end = mid
-
-    # print(f"Exhausted binsearch, returning {next_start}")
-    return next_start
-
-
-def scan_merge(prior_posns: np.ndarray,
-               prior_starts: np.ndarray,
-               next_posns: np.ndarray,
-               next_starts: np.ndarray,
-               scan_algo=advance_to_scan,
-               slop=1):
-    """Merge two term position lists together into a single list of bigrams.
-
-    Each position list is a flattened representation of multiple docs, ie
-
-    prior_posns: [0,2,1,4,5]
-    prior_starts: [0,2]
-
-    Points to the first position of each doc in the prior_posns list for this term
-
-    (same for next_posns)
-
-    This is intentionally written to be naive, C-like for later porting to C
-
-    See notebook:
-    https://colab.research.google.com/drive/10zjUYHGtwMfJMPXz-BHwHe_v6j4MUZzm?authuser=1#scrollTo=W6HuiFGaYCiX
-
-    """
-    next_idx = 0
-    cont_nexts = []
-    cont_next = []
-    bigram_freq = 0
-    bigram_freqs = []
-    last_prior = -2
-    next_start = next_starts[0]
-    prior_start = prior_starts[0]
-    start_idx = 0
-    prior_idx = 0
-    while prior_idx < len(prior_posns):
-        # Scan next until just past prior
-        prior = prior_posns[prior_idx]
-        next_idx = scan_algo(next_posns, target=prior,
-                             next_start=next_start,
-                             idx=next_idx)
-
-        # Reset to head of next location,
-        # Re-advance next
-        if next_idx >= next_start or prior_idx >= prior_start:
-            next_idx = next_start
-            prior_idx = prior_start
-            last_prior = -2
-
-            start_idx += 1
-            if start_idx >= len(next_starts):
-                break
-            next_start = next_starts[start_idx]
-            prior_start = prior_starts[start_idx]
-
-            # Save and reset output
-            cont_nexts.append(np.array(cont_next))
-            bigram_freqs.append(bigram_freq)
-
-            cont_next = []
-            bigram_freq = 0
-
-            prior = prior_posns[prior_idx]
-            next_idx = scan_algo(next_posns, target=prior,
-                                 next_start=next_start, idx=next_idx)
-
-        if next_idx >= next_start:
-            continue
-
-        next_posn = next_posns[next_idx]
-        # Check if within slop
-        # And is not double counting 0->1->2 (only happens if prior/next identical)
-        dist = next_posn - prior
-        if dist <= slop and prior != (last_prior + 1):
-            cont_next.append(next_posn)
-            bigram_freq += 1
-            last_prior = prior
-
-        prior_idx += 1
-
-    # Save last output
-    cont_nexts.append(np.array(cont_next))
-    bigram_freqs.append(bigram_freq)
-    return bigram_freqs, cont_nexts
-
-
-def scan_merge_inplace(prior_posns: np.ndarray,
-                       prior_starts: np.ndarray,
-                       next_posns: np.ndarray,
-                       next_starts: np.ndarray,
-                       scan_algo=advance_to_scan,
-                       slop=1):
-    """Merge two term position lists together into a single list of bigrams.
-
-    Same as scan_merge, but doesn't need a dynamic array output, instead
-    overwrites prior* buffers
-    """
-    next_idx = 0
-    bigram_freq = 0
-    bigram_freqs = []  # this could also be preallocated array output
-    last_prior = -2
-    output_idx = 0
-    next_start = next_starts[0]
-    prior_start = prior_starts[0]
-    start_idx = 0
-    prior_idx = 0
-
-    while prior_idx < prior_starts[-1]:
-        # Scan next until just past p
-        prior = prior_posns[prior_idx]
-        next_idx = scan_algo(next_posns,
-                             target=prior,
-                             next_start=next_start,
-                             idx=next_idx)
-
-        # Reset to head of next location,
-        # Re-advance next
-        if next_idx >= next_start or prior_idx >= prior_start:
-            next_idx = next_start
-            prior_idx = prior_start
-            last_prior = -2
-
-            prior_starts[start_idx] = output_idx
-
-            start_idx += 1
-            if start_idx >= len(next_starts):
-                break
-            next_start = next_starts[start_idx]
-            prior_start = prior_starts[start_idx]
-
-            # Save and reset output
-            bigram_freqs.append(bigram_freq)
-
-            bigram_freq = 0
-
-            prior = prior_posns[prior_idx]
-            next_idx = scan_algo(next_posns,
-                                 target=prior,
-                                 next_start=next_start,
-                                 idx=next_idx)
-        if next_idx >= next_start:
-            continue
-
-        next_posn = next_posns[next_idx]
-        # Check if within slop
-        # And is not double counting 0->1->2 (only happens if prior/next identical)
-        dist = next_posn - prior
-        if dist <= slop and prior != (last_prior + 1):
-            prior_posns[output_idx] = next_posn
-            output_idx += 1
-            bigram_freq += 1
-            last_prior = prior
-
-        prior_idx += 1
-
-    # Save last output
-    bigram_freqs.append(bigram_freq)
-    if start_idx < len(next_starts):
-        prior_starts[start_idx] = output_idx
-
-    return bigram_freqs, prior_posns, prior_starts
-
-
-def scan_merge_bigram(prior_posns: List, next_posns: List, slop=1):
-    prior_starts = np.cumsum([lst.shape[0] for lst in prior_posns])
-    next_starts = np.cumsum([lst.shape[0] for lst in next_posns])
-    prior_posns = np.concatenate(prior_posns)
-    next_posns = np.concatenate(next_posns)
-
-    result = scan_merge(
-        prior_posns, prior_starts,
-        next_posns, next_starts,
-        slop=slop
-    )
-    return result
-
-
 def _self_adjs(prior_posns, next_posns):
     """Given two arrays of positions, return the self adjacencies.
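
Reviewer note: the scan-merge bigram approach deleted above lives on in
scan_merge_ins, which postings.py still imports (see below). As review
context, here is a minimal standalone sketch of the core idea, counting
bigram matches in one document's sorted position arrays. It is
illustrative only: the function name count_bigrams and the sample
positions are invented, and it omits scan_merge's last_prior guard
against double counting when the two query terms are identical.

    import numpy as np

    def count_bigrams(prior_posns: np.ndarray, next_posns: np.ndarray, slop: int = 1) -> int:
        """Count bigram phrase matches in one doc (simplified sketch)."""
        freq = 0
        next_idx = 0
        for prior in prior_posns:
            # Advance the second term's cursor to the first position
            # strictly past the first term's current position.
            while next_idx < len(next_posns) and next_posns[next_idx] <= prior:
                next_idx += 1
            if next_idx == len(next_posns):
                break  # second term exhausted for this doc
            # It's a match if the second term lands within the allowed slop.
            if next_posns[next_idx] - prior <= slop:
                freq += 1
        return freq

    # "foo bar" in "foo bar baz foo bar": foo at [0, 3], bar at [1, 4] -> 2 matches
    assert count_bigrams(np.array([0, 3]), np.array([1, 4])) == 2
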
diff --git a/searcharray/phrase/wide_spans.py b/searcharray/phrase/wide_spans.py
deleted file mode 100644
index 724ca26..0000000
--- a/searcharray/phrase/wide_spans.py
+++ /dev/null
@@ -1,183 +0,0 @@
-"""Phrase search as wide span detection."""
-import numpy as np
-
-
-def advance_after(arr, target, idx, next_start):
-    """Scan arr from idx until just after target. Return next_start if exhausted.
-
-    arr from idx -> next_start should be sorted
-
-    """
-    while idx < next_start and not arr[idx] > target:
-        idx += 1
-    return idx
-
-
-def next_wide_span(posns, starts, start_idx, idxs):
-    """Given N terms find possible acceptable phrases for a slop.
-
-    Parameters:
-    -----------
-    posns - N Numpy arrays, holding all matching document term positions
-    starts - each term N start boundaries, for all D docs
-    start_idx - where we are in the starts (aka doc position)
-    idxs - where we are in each term
-
-    A wide span begins with the first term in a phrase, and finds the
-    first location after that where the last term occurs. Acceptable
-    wide spans are
-
-    Query - "foo bar baz"
-    posn   1    2    3    4
-    t1     foo            foo
-    t2          bar
-    t3               baz
-
-    posn   1    2    3  ...  N-1  N
-    t1     foo  foo
-    t2          bar          bar
-    t3                            baz
-
-    To confirm these wide spans are acceptable phrase matches, we have
-    to walk backwards from the end and see if phrases occur in order within span
-
-    After consuming this wide span, the next phrase candidate can be had by advancing
-    T1 to N+1th position.
-    """
-    # 30100000   20.651    0.000   33.100    0.000 wide_spans.py:16(next_wide_span)
-    # in-place
-    # 30100000   18.470    0.000   29.952    0.000 wide_spans.py:16(next_wide_span)
-    # 30100000   18.078    0.000   29.297    0.000 wide_spans.py:16(next_wide_span)
-
-    span_idx = idxs[0]
-    target = posns[0][span_idx]
-
-    term_idx = 1
-    for term in posns[1:]:
-        next_start = starts[term_idx][start_idx]
-        span_idx = advance_after(posns[term_idx],
-                                 target=target,
-                                 idx=idxs[term_idx],
-                                 next_start=next_start)
-        if span_idx == next_start:
-            return [start[start_idx] for start in starts]
-
-        target = posns[term_idx][span_idx]
-        idxs[term_idx] = span_idx
-        term_idx += 1
-
-    return idxs
-
-
-def get_back_span(posns, starts, start_idx, idxs):
-    """Given N terms find minimal back span.
-
-    Parameters:
-    ----------
-    posns - N Numpy arrays, holding all matching document term positions
-    starts - each term N start boundaries, for all D docs
-    start_idx - where we are in the starts (aka doc position)
-    idxs - where we are in each term
-
-    Returns:
-    --------
-    Shortest span backwards, plus positional diff
-
-    A back span for 'foo bar baz' of a wide span, starts with end position
-    then goes backwards
-
-    Query - "foo bar baz"
-    posn   1     2     3     4
-    t1     foo*              foo
-    t2           bar*
-    t3                 baz*
-
-    posn   1     2     3  ...  N-1  N
-    t1     foo*  foo                   <- scan here
-    t2           bar*          bar     <- scan here
-    t3                              baz
-
-    Walking backwards at idxs[-1] we scan idxs[-2] to find
-    the first occurrence (going backwards)
-    """
-    last_posn = posns[-1][idxs[-1]]
-    back_span_idxs = [idxs[-1]]
-
-    term_idx = len(posns) - 2
-    for posn in posns[::-1][1:]:
-        curr_posn = posn[idxs[term_idx]]
-        while curr_posn < last_posn:
-            idxs[term_idx] += 1
-            if idxs[term_idx] >= starts[term_idx][start_idx]:
-                break
-            curr_posn = posn[idxs[term_idx]]
-        back_span_idxs.append(idxs[term_idx] - 1)
-        last_posn = posn[back_span_idxs[-1]]
-        term_idx -= 1
-    back_span_idxs = back_span_idxs[::-1]
-    return back_span_idxs, (posns[-1][back_span_idxs[-1]] - posns[0][back_span_idxs[0]])
-
-
-def collect_span(posns, starts, start_idx, idxs, acceptable):
-    # Advance 0th beyond
-    beg_posn = posns[0][idxs[0]]
-    last_end_posn = posns[-1][idxs[-1]]
-    if (last_end_posn - beg_posn) <= acceptable:
-        return 1
-    else:
-        back_span, posn_diff = get_back_span(posns, starts, start_idx, idxs)
-        if posn_diff <= acceptable:
-            return 1
-    return 0
-
-
-def all_wide_spans_of_slop(posns, starts, slop=1):
-    """Collect all wide spans of multiple docs for all posn terms.
-
-    Parameters:
-    -----------
-    posns - list of single concatenated array of all doc posns for each term
-    starts - doc start posn boundaries per term
-    slop - allowed (in order) moves of term
-
-    Returns:
-    --------
-    phrase frequency of terms in posns satisfying given slop
-
-    """
-    start_idx = 0
-    span_idxs = np.zeros(len(starts), dtype=np.uint32)
-    phrases_per_doc = np.zeros(len(starts[0]), dtype=np.uint32)
-    # Acceptable posn difference (<=)
-    #        2-gram   3-gram
-    # slop
-    #
-    #  1       1        2
-    #  2       2        3
-    acceptable = len(posns) + (slop - 2)
-    first_term_start = starts[0]
-    num_starts = len(starts[0])
-    while True:
-        if start_idx >= num_starts:
-            break
-        elif span_idxs[0] == first_term_start[start_idx]:
-            span_idxs = [start[start_idx] for start in starts]
-            start_idx += 1
-        else:
-            span_idxs = next_wide_span(posns, starts, start_idx, span_idxs)
-            if span_idxs[0] == first_term_start[start_idx]:
-                # New doc / next start
-                continue
-
-            last_end_posn = posns[-1][span_idxs[-1]]
-            phrases_per_doc[start_idx] += collect_span(posns, starts, start_idx, span_idxs, acceptable)
-
-            span_idxs[0] = advance_after(posns[0],
-                                         target=last_end_posn,
-                                         idx=span_idxs[0],
-                                         next_start=first_term_start[start_idx])
-
-            if span_idxs[0] == first_term_start[-1]:
-                break
-
-    return phrases_per_doc
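
Reviewer note: for posterity, the wide-span technique deleted above can be
summarized in a short single-document sketch. This is not the removed code
itself: count_wide_spans is an invented name, it works on one doc's position
arrays rather than the flattened multi-doc layout, and it omits the
get_back_span refinement that checks a tighter backwards span when the wide
span exceeds the bound. It keeps the same acceptability bound,
len(posns) + (slop - 2), as all_wide_spans_of_slop.

    import numpy as np

    def advance_after(arr, target, idx):
        # Scan a sorted array from idx to the first element strictly past target.
        while idx < len(arr) and arr[idx] <= target:
            idx += 1
        return idx

    def count_wide_spans(posns, slop=1):
        # posns: one sorted position array per query term, for a single doc.
        acceptable = len(posns) + (slop - 2)
        idxs = [0] * len(posns)
        freq = 0
        while idxs[0] < len(posns[0]):
            # Chain forward: find each later term's first position past
            # the previous term's position; the last one ends the span.
            target = posns[0][idxs[0]]
            exhausted = False
            for term_idx in range(1, len(posns)):
                idxs[term_idx] = advance_after(posns[term_idx], target, idxs[term_idx])
                if idxs[term_idx] == len(posns[term_idx]):
                    exhausted = True
                    break
                target = posns[term_idx][idxs[term_idx]]
            if exhausted:
                break
            # Accept the span if its width is within the slop-derived bound.
            if target - posns[0][idxs[0]] <= acceptable:
                freq += 1
            # Next candidate: advance the first term past this span's end.
            idxs[0] = advance_after(posns[0], target, idxs[0])
        return freq

    # "foo bar baz" (slop=1): foo at [0, 4], bar at [1, 6], baz at [2, 5];
    # only the occurrence at 0..2 forms an acceptable span of width 2.
    assert count_wide_spans([np.array([0, 4]), np.array([1, 6]), np.array([2, 5])]) == 1
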
diff --git a/searcharray/postings.py b/searcharray/postings.py
index a82fe51..2c1ba53 100644
--- a/searcharray/postings.py
+++ b/searcharray/postings.py
@@ -14,8 +14,7 @@ import numpy as np
 
 from searcharray.utils.row_viewable_matrix import RowViewableMatrix
 from searcharray.term_dict import TermDict, TermMissingError
-from searcharray.phrase.scan_merge import scan_merge_bigram, scan_merge_inplace, advance_after_binsearch, scan_merge_ins
-from searcharray.phrase.wide_spans import all_wide_spans_of_slop
+from searcharray.phrase.scan_merge import scan_merge_ins
 from searcharray.phrase.posn_diffs import compute_phrase_freqs
 from searcharray.phrase.middle_out import PosnBitArrayBuilder, PosnBitArrayAlreadyEncBuilder, PosnBitArray
 
@@ -665,93 +664,6 @@ def and_query(self, tokens):
     def phrase_freq(self, tokens, slop=1):
         return self.phrase_freq_every_diff(tokens, slop=slop)
 
-    def phrase_freq_wide_spans(self, tokens, slop=1):
-        mask = self.and_query(tokens)
-
-        if np.sum(mask) == 0:
-            return mask
-
-        posns = [self.positions(token, mask) for token in tokens]
-        starts = [np.cumsum([lst.shape[0] for lst in posn]) for posn in posns]
-        posns = [np.concatenate(posn) for posn in posns]
-
-        phrase_freqs = np.zeros(len(self), dtype=np.uint32)
-        freqs = all_wide_spans_of_slop(posns, starts, slop=slop)
-        phrase_freqs[mask] = freqs
-
-        return phrase_freqs
-
-    def phrase_freq_scan_inplace(self, tokens, slop=1):
-        mask = self.and_query(tokens)
-
-        if np.sum(mask) == 0:
-            return mask
-
-        prior_term = tokens[0]
-
-        prior_posns = self.positions(prior_term, mask)
-        prior_starts = np.cumsum([lst.shape[0] for lst in prior_posns])
-        prior_posns = np.concatenate(prior_posns)
-
-        bigram_freqs = None
-
-        phrase_freqs = np.zeros(len(self))
-
-        for term_cnt, term in enumerate(tokens[1:]):
-            term_posns = self.positions(term, mask)
-            term_starts = np.cumsum([lst.shape[0] for lst in term_posns])
-            term_posns = np.concatenate(term_posns)
-            bigram_freqs, prior_posns, prior_starts =\
-                scan_merge_inplace(
-                    prior_posns, prior_starts, term_posns, term_starts, slop=slop)
-
-        phrase_freqs[mask] = bigram_freqs
-        return phrase_freqs
-
-    def phrase_freq_scan_inplace_binsearch(self, tokens, slop=1):
-        mask = self.and_query(tokens)
-
-        if np.sum(mask) == 0:
-            return mask
-
-        prior_term = tokens[0]
-
-        prior_posns = self.positions(prior_term, mask)
-        prior_starts = np.cumsum([lst.shape[0] for lst in prior_posns])
-        prior_posns = np.concatenate(prior_posns)
-
-        bigram_freqs = None
-
-        phrase_freqs = np.zeros(len(self))
-
-        for term_cnt, term in enumerate(tokens[1:]):
-            term_posns = self.positions(term, mask)
-            term_starts = np.cumsum([lst.shape[0] for lst in term_posns])
-            term_posns = np.concatenate(term_posns)
-            bigram_freqs, prior_posns, prior_starts =\
-                scan_merge_inplace(
-                    prior_posns, prior_starts, term_posns, term_starts,
-                    scan_algo=advance_after_binsearch,
-                    slop=slop)
-
-        phrase_freqs[mask] = bigram_freqs
-        return phrase_freqs
-
-    def phrase_freq_scan(self, tokens, slop=1):
-        mask = self.and_query(tokens)
-
-        if np.sum(mask) == 0:
-            return mask
-
-        prior_term = tokens[0]
-        prior_posns = self.positions(prior_term, mask)
-        phrase_freqs = np.zeros(len(self))
-        for term_cnt, term in enumerate(tokens[1:]):
-            term_posns = self.positions(term, mask)
-            bigram_freqs, prior_posns = scan_merge_bigram(prior_posns, term_posns, slop=slop)
-            phrase_freqs[mask] = bigram_freqs
-        return phrase_freqs
-
     def phrase_freq_scan_old(self, tokens, mask=None, slop=1):
         if mask is None:
             mask = self.and_query(tokens)
diff --git a/test/test_phrase_matches.py b/test/test_phrase_matches.py
index 57550a3..d446376 100644
--- a/test/test_phrase_matches.py
+++ b/test/test_phrase_matches.py
@@ -173,7 +173,7 @@ def test_phrase_api(docs, phrase, expected):
 
 @w_scenarios(scenarios)
 @pytest.mark.parametrize("algorithm", ["phrase_freq", "phrase_freq_scan_old",
-                                       "phrase_freq_scan", "phrase_freq_scan_inplace", "phrase_freq_wide_spans"])
+                                       "phrase_freq_every_diff"])
 def test_phrase(docs, phrase, expected, algorithm):
     # if np.all(expected[:5] == [0, 1, 1, 0, 0]) and algorithm in ["phrase_freq_scan", "phrase_freq_scan_inplace"]:
     #     pytest.skip("phrase_freq_scan known failure - different_num_posns_mixed_and_not_phrase")
@@ -253,26 +253,6 @@ def test_phrase_performance(docs, phrase, expected):
     print(f"phrase_match_scan old took {perf_counter() - start} seconds | {len(docs)} docs")
     assert (matches_scan_old == expected).all()
 
-    start = perf_counter()
-    matches_scan = docs.phrase_freq_scan(phrase)
-    print(f"phrase_match_scan took {perf_counter() - start} seconds | {len(docs)} docs")
-    assert (matches_scan == expected).all()
-
-    start = perf_counter()
-    matches_scan_inplace = docs.phrase_freq_scan_inplace(phrase)
-    print(f"phrase_match_scan inplace took {perf_counter() - start} seconds | {len(docs)} docs")
-    assert (matches_scan_inplace == expected).all()
-
-    start = perf_counter()
-    matches_scan_inplace = docs.phrase_freq_wide_spans(phrase)
-    print(f"phrase_match_scan widespa took {perf_counter() - start} seconds | {len(docs)} docs")
-    assert (matches_scan_inplace == expected).all()
-
-    start = perf_counter()
-    matches_scan_inplace = docs.phrase_freq_scan_inplace_binsearch(phrase)
-    print(f"phrase_match_scan inplbin took {perf_counter() - start} seconds | {len(docs)} docs")
-    assert (matches_scan_inplace == expected).all()
-
 
 def test_positions():
     data = PostingsArray.index(["foo bar bar baz", "data2", "data3 bar", "bunny funny wunny"] * 25)
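
Reviewer note: with the experimental implementations removed, the surviving
code paths can be cross-checked the same way test_phrase does above. A quick
sanity check, assuming PostingsArray is importable from searcharray.postings
as the module path suggests:

    from searcharray.postings import PostingsArray

    docs = PostingsArray.index(["foo bar bar baz", "data2", "data3 bar", "bunny funny wunny"] * 25)
    # phrase_freq delegates to phrase_freq_every_diff, so the two must agree.
    assert (docs.phrase_freq(["foo", "bar"]) == docs.phrase_freq_every_diff(["foo", "bar"])).all()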