Commit 491636b
Add building index from eager terms
softwaredoug committed Jun 11, 2024
1 parent 9eb38ed commit 491636b
Showing 4 changed files with 166 additions and 41 deletions.
31 changes: 30 additions & 1 deletion searcharray/indexing.py
@@ -5,7 +5,7 @@
from typing import Iterable, List
from concurrent.futures import ThreadPoolExecutor, as_completed
from itertools import islice
from searcharray.phrase.middle_out import MAX_POSN, PosnBitArrayFromFlatBuilder, PosnBitArrayBuilder, PosnBitArrayAlreadyEncBuilder
from searcharray.phrase.middle_out import MAX_POSN, PosnBitArrayFromFlatBuilder, PosnBitArrayAlreadyEncBuilder
from searcharray.term_dict import TermDict
from searcharray.utils.mat_set import SparseMatSetBuilder
from searcharray.utils.row_viewable_matrix import RowViewableMatrix
@@ -227,3 +227,32 @@ def build_index_from_tokenizer(array: Iterable, tokenizer, batch_size=10000,
term_doc_built = RowViewableMatrix(term_doc.build())
logger.info("Indexing from tokenization complete")
return term_doc_built, bit_posns, term_dict, avg_doc_length, np.array(doc_lens)


def build_index_from_eager_terms(array):
term_dict = TermDict()
term_doc = SparseMatSetBuilder()
doc_lens = []
avg_doc_length = 0
num_postings = 0
posns = PosnBitArrayAlreadyEncBuilder()

for doc_id, doc in enumerate(array):
doc_lens.append(doc.doc_len)
avg_doc_length += doc_lens[-1]
terms = []
for token, term_freq in doc.terms():
term_id = term_dict.add_term(token)
terms.append(term_id)
positions = doc.positions(token)
if positions is not None:
posns.add_posns(doc_id, term_id, positions)
term_doc.append(terms)

posns.ensure_capacity(doc_id)
num_postings += 1

if num_postings > 0:
avg_doc_length /= num_postings
bit_posns = posns.build()
return RowViewableMatrix(term_doc.build()), bit_posns, term_dict, avg_doc_length, np.array(doc_lens)
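For context, a minimal sketch of the new eager-terms indexing path (illustrative only, not part of this commit; it assumes this commit's searcharray and these import paths): each element of an indexed SearchArray is a LazyTerms view, to_eager() materialises it, and build_index_from_eager_terms rebuilds an index from those eager docs.

# Illustrative sketch, assuming this commit's searcharray and import paths.
from searcharray.postings import SearchArray
from searcharray.indexing import build_index_from_eager_terms

arr = SearchArray.index(["foo bar bar baz", "bunny funny wunny"])
# Iterating a SearchArray yields LazyTerms views; to_eager() is the expensive
# per-document conversion defined in postings.py below.
eager_docs = [lazy.to_eager() for lazy in arr]
term_mat, bit_posns, term_dict, avg_doc_len, doc_lens = \
    build_index_from_eager_terms(eager_docs)
print(avg_doc_len, list(doc_lens))  # average doc length and per-doc token counts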
108 changes: 80 additions & 28 deletions searcharray/postings.py
@@ -8,16 +8,17 @@
from collections import Counter
import warnings
import logging
from typing import List, Union, Optional, Iterable, Iterator, Any
from typing import List, Union, Optional, Iterable, Iterator, Any, Sequence


import numpy as np
from searcharray.phrase.middle_out import PosnBitArray
from searcharray.similarity import Similarity, default_bm25
from searcharray.indexing import build_index_from_tokenizer
from searcharray.indexing import build_index_from_tokenizer, build_index_from_eager_terms
from searcharray.term_dict import TermMissingError
from searcharray.roaringish.roaringish_ops import as_dense
from searcharray.utils.mat_set import SparseMatSet
from searcharray.term_dict import TermDict

logger = logging.getLogger(__name__)

@@ -35,7 +36,7 @@ class EagerTerms:
"""An indexed search doc - a single bag of tokenized words and positions."""

def __init__(self,
postings,
postings: dict[str, int],
doc_len: int = 0,
posns: Optional[dict] = None,
encoded=False):
@@ -146,33 +147,39 @@ class LazyTerms:
"""Implements a view to a doc in the postings, but only
fetches the postings when needed."""

def __init__(self, doc_id=-1, posns=None, terms=None):
def __init__(self, doc_id=-1,
posns: Optional[PosnBitArray] = None,
terms: Optional[SparseMatSet] = None,
tokenizer=None,
term_dict=None):
self.posns = posns
self.doc_id = doc_id
self.terms = terms
if self.terms is None:
self.terms = SparseMatSet()
self.terms: SparseMatSet = SparseMatSet() if terms is None else terms
self.term_dict = TermDict() if term_dict is None else term_dict
self.tokenizer = tokenizer

def __eq__(self, other):
# Flip to the other implementation if we're comparing to a SearchArray
# to get a boolean array back
if isinstance(other, SearchArray):
return other == self
return isinstance(other, LazyTerms) \
and self.term_dict.compatible(other.term_dict) \
and self.tokenizer == other.tokenizer \
and len(self.terms.cols) == len(other.terms.cols) \
and np.all(self.terms.cols == other.terms.cols)

def __len__(self):
return len(self.terms)

def __repr__(self):
return f"LazyTerms(doc_id={self.doc_id})"
return f"LazyTerms(doc_id={self.doc_id}, posns={id(self.posns)})"

def __str__(self):
return f"LazyTerms(doc_id={self.doc_id})"
return f"LazyTerms(doc_id={self.doc_id}), posns={id(self.posns)})"

def __lt__(self, other):
return self.doc_id < other.doc_id
return hash(self.terms) < hash(other.terms)

def __le__(self, other):
return self.doc_id < other.doc_id or self.doc_id == other.doc_id
@@ -184,25 +191,36 @@ def __hash__(self):
return hash(str(self.doc_id))

def to_eager(self) -> EagerTerms:
posns = self.raw_positions(self.posns, self.terms)
doc_len = len(self.terms)
return EagerTerms(self.terms, doc_len=doc_len, posns=posns,
"""Conversion to an eager view is expensive!"""
raw_posns = self.raw_positions()
doc_len = 0
# Get term freqs for terms
tfs_as_dicts = {}
if self.posns is not None:
for term_id in self.terms.cols:
tf = self.posns.termfreqs(term_id,
doc_ids=np.asarray([self.doc_id]))[1][0]
doc_len += tf
tfs_as_dicts[term_id] = tf
return EagerTerms(tfs_as_dicts,
doc_len=doc_len,
posns=raw_posns,
encoded=True)

def raw_positions(self, term_dict, term=None):
def raw_positions(self, term=None):
tfs = {}
posns = {}
for term_idx in self.terms:
tfs[term] = 1
enc_term_posns = posns.doc_encoded_posns(term_idx, doc_id=self.doc_id)
posns[term] = enc_term_posns
for term_idx in self.terms.cols:
tfs[term_idx] = 1
enc_term_posns = self.posns.doc_encoded_posns(term_idx, doc_id=self.doc_id)
posns[term_idx] = enc_term_posns

if posns is None:
return {}
if term is None:
raw_posns = [(term_dict.get_term_id(term), posns) for term, posns in posns.items()]
return posns
else:
raw_posns = [(term_dict.get_term_id(term), posns[term])]
raw_posns = [(self.term_dict.get_term_id(term), posns[term])]
return raw_posns
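An illustrative sketch of what this conversion yields, assuming this commit's searcharray: to_eager() sums per-term frequencies into doc_len and carries positions in their encoded form (hence encoded=True), which is why the docstring calls it expensive.

# Illustrative only, assuming this commit's searcharray.
from searcharray.postings import SearchArray

arr = SearchArray.index(["foo bar bar baz"])
lazy = arr[0]              # LazyTerms view: nothing copied out of the index yet
eager = lazy.to_eager()    # EagerTerms: term freqs plus already-encoded positions
print(eager.doc_len)       # summed term frequencies for this doc (4 tokens here)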


@@ -336,6 +354,35 @@ def index(cls, array: Iterable,
postings.corpus_size = len(doc_lens)
return postings

@classmethod
def from_docs(cls, terms: Sequence[LazyTerms]) -> 'SearchArray':
"""Create a SearchArray from a list of LazyTerms.
Will extract the term frequencies and positions from each LazyTerms (expensive)
and construct a new SearchArray.
"""
term_mat = SparseMatSet()
tokenizer = terms[0].tokenizer if len(terms) > 0 else None
# Check for compatibility
eager_terms = []
for term in terms:
if not term.tokenizer == tokenizer:
raise ValueError("All terms must have the same tokenizer")
if not term.term_dict.compatible(terms[0].term_dict):
raise ValueError("All terms must have the same term dictionary")
eager_terms.append(term.to_eager())
term_mat, posns, term_dict, avg_doc_length, doc_lens =\
build_index_from_eager_terms(eager_terms)

postings = cls(tokenizer=tokenizer, avoid_copies=True)
postings.term_mat = term_mat
postings.posns = posns
postings.term_dict = term_dict
postings.avg_doc_length = avg_doc_length
postings.doc_lens = doc_lens
postings.corpus_size = len(terms)
return postings

def warm(self):
self.posns.warm()

@@ -346,14 +393,14 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
if not isinstance(dtype, TermsDtype):
return scalars
if isinstance(scalars, np.ndarray) and scalars.dtype == TermsDtype():
return cls(scalars)
return cls.from_docs(scalars)
# String types
elif isinstance(scalars, np.ndarray) and scalars.dtype.kind in 'US':
return cls(scalars)
return cls.from_docs(scalars)
# Other objects
elif isinstance(scalars, np.ndarray) and scalars.dtype != object:
return scalars
return cls(scalars)
return cls.from_docs(scalars)

def memory_usage(self, deep=False):
"""Return memory usage of this array in bytes."""
@@ -370,7 +417,12 @@ def __getitem__(self, key):
try:
# rows = self.term_mat[key]
doc_id = key
return LazyTerms(doc_id, self.posns, self.term_mat[key])
if len(self.term_mat[key].cols) == 0:
return LazyTerms()
return LazyTerms(doc_id=doc_id, posns=self.posns,
terms=self.term_mat[key],
tokenizer=self.tokenizer,
term_dict=self.term_dict)
except IndexError:
raise IndexError("index out of bounds")
else:
@@ -507,17 +559,17 @@ def copy(self):

@classmethod
def _concat_same_type(cls, to_concat):
import pdb; pdb.set_trace()
concatenated_data = np.concatenate([ea[:] for ea in to_concat])
return SearchArray(concatenated_data, tokenizer=to_concat[0].tokenizer)
return cls.from_docs(concatenated_data)

@classmethod
def _from_factorized(cls, values, original):
return cls(values)
raise NotImplementedError("Factorization/Grouping not supported by SearchArray")

def _values_for_factorize(self):
"""Return an array and missing value suitable for factorization (ie grouping)."""
arr = np.asarray(self[:], dtype=object)
return arr, LazyTerms()
raise NotImplementedError("Factorization/Grouping not supported by SearchArray")

def _check_token_arg(self, token):
if isinstance(token, str):
3 changes: 3 additions & 0 deletions searcharray/utils/mat_set.py
@@ -153,3 +153,6 @@ def __str__(self):
for idx, (row, row_next) in enumerate(zip(self.rows, self.rows[1:])):
as_str.append(f"{idx}: {self.cols[row:row_next]}")
return "\n".join(as_str)

def __hash__(self):
return hash((self.cols.tobytes(), self.rows.tobytes()))
65 changes: 53 additions & 12 deletions test/test_extension_array.py
@@ -13,7 +13,10 @@ def dtype():
@pytest.fixture
def data():
"""Return a fixture of your data here that returns an instance of your ExtensionArray."""
return SearchArray.index(["foo bar bar baz", "data2", "data3 bar", "bunny funny wunny"] * 25)
arr = SearchArray.index(["foo bar bar baz", "data2", "data3 bar", "bunny funny wunny"] * 25)
for idx, item in enumerate(arr):
assert idx == item.doc_id
return arr


@pytest.fixture
@@ -68,15 +71,14 @@ def data_for_sorting():
This should be three items [B, C, A] with
A < B < C
pytest.skip("Grouping not supported by SearchArray")
"""
arr = SearchArray.index(["abba mmma dabbb", "abba abba aska", "caa cata"])
return arr
pass


@pytest.fixture
def data_missing_for_sorting():
arr = SearchArray.index(["abba mmma dabbb", "", "caa cata"])
return arr
pytest.skip("Grouping not supported by SearchArray")


@pytest.fixture
@@ -87,11 +89,7 @@ def data_for_grouping():
Where A < B < C and NA is missing
"""
arr = SearchArray.index(["abba mmma dabbb", "abba mmma dabbb",
"", "",
"caa cata", "caa cata",
"abba mmma dabbb", "abba abba aska"])
return arr
pytest.skip("Grouping not supported by SearchArray")


@pytest.fixture(
@@ -137,6 +135,10 @@ def fillna_method(request):
return request.param


def test_na_values_eq():
assert LazyTerms() == LazyTerms()


# Then create a class that inherits from the base tests you want to use
class TestDType(base.BaseDtypeTests):
# You'll need to at least provide the following attributes
@@ -152,10 +154,49 @@ class TestMethods(base.BaseMethodsTests):
# Unique not supported on inverted index rows, for performance
# reasons
def test_value_counts_with_normalize(self, data):
pass
pytest.skip("Unique not supported on inverted index rows, for performance reasons")

def test_unique(self, data):
pass
pytest.skip("Unique not supported on inverted index rows, for performance reasons")

def test_argsort(self):
pytest.skip("sorting not supported for inverted index rows")

def test_argsort_missing(self):
pytest.skip("sorting not supported for inverted index rows")

def test_nargsort(self, data_for_sorting):
pytest.skip("sorting not supported for inverted index rows")

def test_sort_values_missing(self, data_missing_for_sorting):
pytest.skip("sorting not supported for inverted index rows")

def test_argsort_missing_array(self, data_missing_for_sorting):
pytest.skip("sorting not supported for inverted index rows")

@pytest.mark.parametrize("ascending", [True, False])
def test_sort_values(self, data_for_sorting, ascending, sort_by_key):
pytest.skip("sorting not supported for inverted index rows")

def test_argmin_argmax(self, data):
pytest.skip("argmin and argmax not supported for inverted index rows")

def test_argmin_argmax_all_na(self, data_missing):
pytest.skip("argmin and argmax not supported for inverted index rows")

def test_argreduce_series(self, data):
pytest.skip("argmin and argmax not supported for inverted index rows")

def test_factorize_empty(self, data):
pytest.skip("factorize not supported for inverted index rows")

def test_searchsorted(self, data):
pytest.skip("searchsorted not supported for inverted index rows")

def test_sort_values_frame(self, data_for_sorting, sort_by_key):
pytest.skip("sorting not supported for inverted index rows")




class TestReshaping(base.BaseReshapingTests):
