Skip to content

Commit

Permalink
Confirm searching indexed empty dataframes works
Browse files: browse the repository at this point in the history
  • Loading branch information
softwaredoug committed Jul 13, 2024
1 parent 016eddc commit fe3a428
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 2 deletions.
2 changes: 1 addition & 1 deletion searcharray/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def _compute_doc_lens(posns: np.ndarray, doc_ids: np.ndarray, num_docs: int) ->
non_empty_doc_ids = doc_ids[non_empty_idxs]
non_empty_doc_lens = non_empty_doc_lens[non_empty_idxs]
doc_lens[non_empty_doc_ids] = non_empty_doc_lens
if doc_ids[-1] not in non_empty_doc_ids:
if len(doc_ids) > 0 and doc_ids[-1] not in non_empty_doc_ids:
doc_lens[doc_ids[-1]] = posns[-1] + 1
return doc_lens

Expand Down
2 changes: 2 additions & 0 deletions searcharray/phrase/middle_out.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,8 @@ def build(self):
encoded, enc_term_boundaries = encoder.encode(keys=self.flat_array[1].view(np.uint64),
boundaries=term_boundaries[:-1],
payload=self.flat_array[2].view(np.uint64))
if len(encoded) == 0:
return PosnBitArray({}, self.max_doc_id)
term_ids = self.flat_array[0][term_boundaries[:-1]]

encoded_term_posns = ArrayDict.from_array_with_boundaries(encoded,
Expand Down
5 changes: 4 additions & 1 deletion searcharray/similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@ def compute_idf(num_docs, dfs):


def compute_adj_doc_lens(doc_lens, avg_doc_lens, k1, b):
adj_doc_lens = doc_lens / avg_doc_lens
if avg_doc_lens == 0:
adj_doc_lens = np.zeros_like(doc_lens, dtype=np.float32)
else:
adj_doc_lens = doc_lens / avg_doc_lens
adj_doc_lens *= b
adj_doc_lens += 1 - b
adj_doc_lens *= k1
Expand Down
11 changes: 11 additions & 0 deletions test/test_search.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Test postings array search functionality."""
import numpy as np
import pandas as pd
import pytest
from searcharray.postings import SearchArray
from searcharray.similarity import bm25_similarity
Expand All @@ -12,6 +13,16 @@ def data():
return SearchArray.index(["foo bar bar baz", "data2", "data3 bar", "bunny funny wunny"] * 25)


@pytest.fixture
def all_empty_str():
    """Provide a 100-row DataFrame whose "data" column holds only empty strings."""
    empty_rows = ["" for _ in range(100)]
    return pd.DataFrame({"data": empty_rows})


def test_search_empty_str(all_empty_str):
    """Scoring a term against an index built only from empty strings sums to zero."""
    # Index the all-empty "data" column, then score a term that cannot match.
    indexed = SearchArray.index(all_empty_str["data"])
    total_score = indexed.score("foo").sum()
    assert total_score == 0


def test_match(data):
matches = data.termfreqs("foo") > 0
assert (matches == [True, False, False, False] * 25).all()
Expand Down

0 comments on commit fe3a428

Please sign in to comment.