From 4dbb8769dfc33fb46003a53f7136d4c9e8f0b8e9 Mon Sep 17 00:00:00 2001 From: Doug Turnbull Date: Sat, 20 Jul 2024 07:53:22 -0400 Subject: [PATCH] More BM25 similarity tests --- test/test_similarity.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/test/test_similarity.py b/test/test_similarity.py index cb0c72e..60fae98 100644 --- a/test/test_similarity.py +++ b/test/test_similarity.py @@ -30,6 +30,22 @@ def arr(x): "num_docs": 8514, "expected": 3.8199246 }, + "rambo_tmdb": { + "term_freqs": 2, # freq, occurrences of term within document + "doc_freqs": 7, # n, number of documents containing term + "doc_lens": 44, # dl, length of field (approximate) + "avg_doc_len": 50.580456, # avgdl, average length of field + "num_docs": 8514, # N, total number of documents with field + "expected": 4.5636616 + }, + "the_tmdb": { + "term_freqs": 25, # freq, occurrences of term within document + "doc_freqs": 7823, # n, number of documents containing term + "doc_lens": 152, # dl, length of field (approximate) + "avg_doc_len": 119.18542, # avgdl, average length of field + "num_docs": 8516, # N, total number of documents with field + "expected": 0.08028283 + } }