-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
8e20db0
commit 5664c05
Showing 2 changed files with 101 additions and 36 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,31 +1,63 @@ | ||
from searcharray.postings import PostingsArray | ||
|
||
|
||
def test_phrase_match():
    """Check that the phrase "foo bar" matches only the first document of each group."""
    corpus = ["foo bar bar baz", "data2", "data3 bar", "bunny funny wunny"] * 25
    indexed = PostingsArray.index(corpus)
    expected = [True, False, False, False] * 25
    found = indexed.phrase_match(["foo", "bar"])
    assert (found == expected).all()
|
||
|
||
def test_phrase_match_three_terms():
    """Check that the three-term phrase "bunny funny wunny" matches only the last document of each group."""
    corpus = ["foo bar bar baz", "data2", "data3 bar", "bunny funny wunny"] * 25
    indexed = PostingsArray.index(corpus)
    expected = [False, False, False, True] * 25
    found = indexed.phrase_match(["bunny", "funny", "wunny"])
    assert (found == expected).all()
|
||
|
||
def test_phrase_match_three_terms_spread_out_doesnt_match():
    """Check that "foo bar baz" does not match when other tokens separate the terms."""
    # "foo bar" and "bar baz" each occur in the first document, but never as
    # one contiguous three-term run.
    corpus = ["foo bar EEK foo URG bar baz", "data2", "data3 bar", "bunny funny wunny"] * 25
    indexed = PostingsArray.index(corpus)
    expected = [False, False, False, False] * 25
    found = indexed.phrase_match(["foo", "bar", "baz"])
    assert (found == expected).all()
|
||
|
||
def test_phrase_match_same_term_matches():
    """Check that a phrase made of one repeated term matches a document repeating that term."""
    corpus = ["foo foo foo", "data2", "data3 bar", "bunny funny wunny"] * 25
    repeated = PostingsArray.index(corpus)
    expected = [True, False, False, False] * 25
    found = repeated.phrase_match(["foo", "foo", "foo"])
    assert (found == expected).all()
|
||
|
||
def test_phrase_match_duplicate_phrases():
    """Check that a document containing the phrase twice is still reported as a single match."""
    corpus = ["foo bar foo bar", "data2", "data3 bar", "bunny funny wunny"] * 25
    indexed = PostingsArray.index(corpus)
    expected = [True, False, False, False] * 25
    found = indexed.phrase_match(["foo", "bar"])
    assert (found == expected).all()
from test_utils import w_scenarios | ||
from time import perf_counter | ||
|
||
|
||
def _scenario(first_doc, phrase, first_matches, repeats=25):
    """Build one phrase-match scenario.

    Every scenario uses a four-document group repeated ``repeats`` times:
    ``first_doc`` followed by three fixed filler documents. Only the first
    document of each group is ever expected to match, so the expected mask is
    ``[first_matches, False, False, False]`` repeated the same number of times.
    """
    group = [first_doc, "data2", "data3 bar", "bunny funny wunny"]
    return {
        "docs": PostingsArray.index(group * repeats),
        "phrase": phrase,
        "expected": [first_matches, False, False, False] * repeats,
    }


# Named scenarios consumed by the parametrized phrase test below. Insertion
# order is kept as originally written.
scenarios = {
    "base": _scenario("foo bar bar baz", ["foo", "bar"], True),
    "multi_term_one_doc": _scenario("foo bar bar bar foo", ["foo", "bar"], True),
    "three_terms_match": _scenario("foo bar baz baz", ["foo", "bar", "baz"], True),
    "three_terms_no_match": _scenario("foo bar bar baz", ["foo", "bar", "baz"], False),
    # Large corpus (4 * 100000 documents), built at import time like the rest.
    "many_docs": _scenario("foo bar bar baz", ["foo", "bar"], True, repeats=100000),
    "three_terms_spread_out": _scenario("foo bar EEK foo URG bar baz", ["foo", "bar", "baz"], False),
    "same_term_matches": _scenario("foo foo foo", ["foo", "foo"], True),
    "same_term_matches_3": _scenario("foo foo foo", ["foo", "foo", "foo"], True),
    "duplicate_phrases": _scenario("foo bar foo bar", ["foo", "bar"], True),
}
|
||
|
||
@w_scenarios(scenarios)
def test_phrase(docs, phrase, expected):
    """Run ``phrase_match`` for one scenario and verify the result.

    Args:
        docs: The indexed array from the scenario's ``"docs"`` entry.
        phrase: List of terms to look up as a phrase.
        expected: Per-document booleans the match result must equal.

    Prints the time spent in ``phrase_match`` alone, then asserts that the
    matches equal ``expected``. For corpora under 1000 documents it also
    asserts that ``phrase_match`` left ``docs`` unmodified.
    """
    # Take the snapshot before starting the timer so the reported duration
    # covers only phrase_match, and skip it for large corpora where the
    # unmodified check below is not performed anyway.
    check_unmodified = len(docs) < 1000
    docs_before = docs.copy() if check_unmodified else None

    start = perf_counter()
    matches = docs.phrase_match(phrase)
    elapsed = perf_counter() - start
    print(f"phrase_match took {elapsed} seconds | {len(docs)} docs")

    assert (matches == expected).all()
    if check_unmodified:
        assert (docs == docs_before).all(), "The phrase_match method should not modify the original array"