
Commit

Fix concat with index creation
softwaredoug committed Jul 12, 2024
1 parent b2ef647 commit 156b65f
Showing 2 changed files with 60 additions and 16 deletions.
searcharray/phrase/memmap_arrays.py: 42 changes (28 additions, 14 deletions)
@@ -1,6 +1,6 @@
 import numpy as np
 import os
-from typing import Optional, List
+from typing import Optional, List, Dict


 def create_filename(data_dir: str):

@@ -49,21 +49,35 @@ def from_array_with_boundaries(data: np.ndarray,

     @staticmethod
     def concat(lhs: 'ArrayDict', rhs: 'ArrayDict', sort=True):
-        metadata = {}
-        offset = 0
-        for key, value in lhs.metadata.items():
-            metadata[key] = {'offset': offset, 'length': value['length']}
-            offset += value['length']
-        for key, value in rhs.metadata.items():
-            metadata[key] = {'offset': offset, 'length': value['length']}
-            offset += value['length']
-        data = np.concatenate((lhs.data, rhs.data))
-        if sort:
-            keys = sorted(metadata.keys())
-            metadata = {key: metadata[key] for key in keys}
-        arr = ArrayDict()
-        arr.data = data
+        arr = ArrayDict()
+        metadata: Dict[int, Dict[str, int]] = {}
+        lst_of_arrays: List[np.ndarray] = []
+
+        curr_offset = 0
+        last_offset = 0
+
+        fetched_keys = set()
+        for key, value in lhs.items():
+            fetched_keys.add(key)
+            all_for_key = [value]
+            curr_offset += value.size
+            if key in rhs.metadata:
+                rhs_value = rhs[key]
+                all_for_key.append(rhs_value)
+                curr_offset += rhs_value.size
+            metadata[key] = {'offset': last_offset, 'length': curr_offset - last_offset}
+            last_offset = curr_offset
+            lst_of_arrays.append(np.concatenate(all_for_key))
+
+        for key, value in rhs.items():
+            if key not in fetched_keys:
+                curr_offset += value.size
+                lst_of_arrays.append(value)
+                metadata[key] = {'offset': last_offset, 'length': curr_offset - last_offset}
+                last_offset = curr_offset
+
+        arr.metadata = metadata
+        arr.data = np.concatenate(lst_of_arrays)
         return arr

     def __getitem__(self, key):
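What the rewrite changes: the old concat appended rhs.data after lhs.data wholesale, then rebuilt metadata by walking lhs's and then rhs's key maps in turn, so a key present on both sides had its lhs entry overwritten by the rhs entry and its lhs postings orphaned (lookups, which presumably slice data[offset:offset + length], would see only the rhs half). The new version walks keys and concatenates per key, so each key's merged postings occupy one contiguous slice. A minimal standalone sketch of that merge semantics over plain dicts of arrays (illustration only, not the library's ArrayDict API):

import numpy as np


def merge(lhs, rhs):
    """Merge two {key: 1-D array} dicts into one flat buffer plus slice metadata."""
    metadata = {}
    chunks = []
    offset = 0
    # lhs keys first (merged with any rhs segment for the same key), then rhs-only keys
    for key in list(lhs) + [k for k in rhs if k not in lhs]:
        seg = np.concatenate([d[key] for d in (lhs, rhs) if key in d])
        metadata[key] = {'offset': offset, 'length': seg.size}
        chunks.append(seg)
        offset += seg.size
    return np.concatenate(chunks), metadata


data, meta = merge({1: np.array([10, 20])}, {1: np.array([30]), 2: np.array([40])})
assert np.array_equal(data[0:3], [10, 20, 30])   # key 1: both sides, one contiguous slice
assert np.array_equal(data[3:4], [40])           # key 2: rhs only, placed after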
test/test_tmdb.py: 34 changes (32 additions, 2 deletions)
@@ -1,19 +1,49 @@
 import pytest
 import gzip
+import os
 from time import perf_counter
 import json
 import pandas as pd
 import numpy as np
 import sys
+import shutil
 from searcharray.postings import SearchArray
 from searcharray.solr import edismax
 from searcharray.similarity import default_bm25
 from test_utils import Profiler, profile_enabled, naive_find_term


+DATA_DIR = '/tmp/tmdb'
+
+
 should_profile = '--benchmark-disable' in sys.argv


+def clean_data_dir():
+    try:
+        shutil.rmtree(DATA_DIR)
+    except FileNotFoundError:
+        pass
+
+
+def ensure_data_dir_exists():
+    try:
+        shutil.rmtree(DATA_DIR)
+    except FileNotFoundError:
+        pass
+    try:
+        os.makedirs(DATA_DIR)
+    except FileExistsError:
+        pass
+
+
+@pytest.fixture(scope="session", autouse=True)
+def clean_up():
+    ensure_data_dir_exists()
+    yield
+    clean_data_dir()
+
+
 @pytest.fixture(scope="session")
 def tmdb_raw_data():
     path = 'fixtures/tmdb.json.gz'
@@ -48,7 +78,7 @@ def tmdb_data(tmdb_pd_data, request):
     df = tmdb_pd_data
     indexed = SearchArray.index(df['title'],
                                 batch_size=5000 if request.param in ["small_batch", "smallbatch_memmap"] else 100000,
-                                data_dir="/tmp/" if request.param == "memmap" else None)
+                                data_dir=DATA_DIR if request.param == "memmap" else None)
     df['title_tokens'] = indexed

     # set last 3 overview strings to empty
@@ -58,7 +88,7 @@ def tmdb_data(tmdb_pd_data, request):

     indexed = SearchArray.index(df['overview'],
                                 batch_size=5000 if request.param in ["small_batch", "smallbatch_memmap"] else 100000,
-                                data_dir="/tmp/" if request.param == "memmap" else None)
+                                data_dir=DATA_DIR if request.param == "memmap" else None)
     df['overview_tokens'] = indexed
     return df
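The tmdb_data fixture these hunks touch is parametrized; its parameter list sits outside the visible context. A sketch of the pattern the visible conditions imply (the fixture name and param list here are hypothetical, not the repository's code):

import pytest

DATA_DIR = '/tmp/tmdb'


# Hypothetical param list and fixture name, inferred from the conditions above.
@pytest.fixture(scope="session", params=["full", "small_batch", "memmap", "smallbatch_memmap"])
def index_settings(request):
    # Small batches for the *batch* params; on-disk postings only for "memmap".
    return {
        'batch_size': 5000 if request.param in ["small_batch", "smallbatch_memmap"] else 100000,
        'data_dir': DATA_DIR if request.param == "memmap" else None,
    }

Separately, the new clean_data_dir and ensure_data_dir_exists helpers could collapse their try/except pairs into shutil.rmtree(DATA_DIR, ignore_errors=True) and os.makedirs(DATA_DIR, exist_ok=True), though ignore_errors=True swallows every rmtree failure rather than just a missing directory.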
