From 604c678f398d4c9553227c4aafba8bd3e8c11411 Mon Sep 17 00:00:00 2001 From: Doug Turnbull Date: Wed, 15 May 2024 09:56:57 -0400 Subject: [PATCH] Change test on compatibility given threading --- searcharray/indexing.py | 1 + test/test_tmdb.py | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/searcharray/indexing.py b/searcharray/indexing.py index f804b8b..13ef53d 100644 --- a/searcharray/indexing.py +++ b/searcharray/indexing.py @@ -153,6 +153,7 @@ def _process_batches(term_doc, batch_size, doc_lens=None, truncate=False): batch_results = [None] * len(futures) + batch_beg = 0 for future in as_completed(futures): try: batch_beg, batch_term_doc, batch_bit_posns, batch_doc_lens = future.result() diff --git a/test/test_tmdb.py b/test/test_tmdb.py index 6c01c26..63ddf98 100644 --- a/test/test_tmdb.py +++ b/test/test_tmdb.py @@ -102,7 +102,12 @@ def test_slice_then_search(tmdb_data): def test_batch_sizes_give_same(tmdb_data): with_batch_10k = SearchArray.index(tmdb_data['overview'], batch_size=10000) with_batch_5k = SearchArray.index(tmdb_data['overview'], batch_size=5000) - assert np.all(with_batch_10k == with_batch_5k) + # The full arrays are not expected to compare equal, because with threading the term dict + # may assign different term ids; individual docs should still be the same + assert np.all(with_batch_10k[-1] == with_batch_5k[-1]) + assert np.all(with_batch_10k[100] == with_batch_5k[100]) + assert np.all(with_batch_10k[5000] == with_batch_5k[5000]) + assert np.all(with_batch_10k[5001] == with_batch_5k[5001]) tmdb_term_matches = [