Skip to content

Commit

Permalink
add range query compatible engines
Browse files Browse the repository at this point in the history
  • Loading branch information
PSeitz committed Dec 28, 2022
1 parent b956323 commit e745fc5
Show file tree
Hide file tree
Showing 9 changed files with 88 additions and 43 deletions.
7 changes: 4 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@ export

WIKI_SRC = "https://www.dropbox.com/s/wwnfnu441w1ec9p/wiki-articles.json.bz2"

COMMANDS ?= TOP_10 TOP_10_COUNT COUNT
COMMANDS ?= TOP_10 TOP_10_COUNT COUNT

# ENGINES ?= tantivy-0.13 lucene-8.4.0 pisa-0.8.2 rucene-0.1 bleve-0.8.0-scorch rucene-0.1 tantivy-0.11 tantivy-0.14 tantivy-0.15 tantivy-0.16 tantivy-0.17 tantivy-0.18 tantivy-0.19
# ENGINES ?= tantivy-0.13 lucene-8.4.0 pisa-0.8.2 rucene-0.1 bleve-0.8.0-scorch rucene-0.1 tantivy-0.11 tantivy-0.16 tantivy-0.17 tantivy-0.18 tantivy-0.19
# ENGINES ?= tantivy-0.16 lucene-8.10.1 pisa-0.8.2 bleve-0.8.0-scorch rucene-0.1
ENGINES ?= tantivy-0.19 lucene-8.10.1
ENGINES ?= tantivy-0.18 tantivy-0.19 lucene-8.10.1
export RANGE_QUERY_ENABLED_ENGINES ?= tantivy-0.18 tantivy-0.19 lucene-8.10.1 lucene-8.0.0 lucene-7.2.1
PORT ?= 8080

help:
Expand Down
4 changes: 4 additions & 0 deletions engines/lucene-7.2.1/src/main/java/BuildIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,11 @@ public static void main(String[] args) throws IOException {
final Document document = new Document();

StoredField idField = new StoredField("id", "");
IntPoint idNumField = new IntPoint("id_num", 0);
TextField textField = new TextField("text", "", Field.Store.NO);

document.add(idField);
document.add(idNumField);
document.add(textField);

String line;
Expand All @@ -37,8 +39,10 @@ public static void main(String[] args) throws IOException {
}
final JsonObject parsed_doc = Json.parse(line).asObject();
final String id = parsed_doc.get("id").asString();
final int id_num = parsed_doc.get("id_num").asInt();
final String text = parsed_doc.get("text").asString();
idField.setStringValue(id);
idNumField.setIntValue(id_num);
textField.setStringValue(text);
writer.addDocument(document);
}
Expand Down
4 changes: 4 additions & 0 deletions engines/lucene-8.0.0/src/main/java/BuildIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,11 @@ public static void main(String[] args) throws IOException {
final Document document = new Document();

StoredField idField = new StoredField("id", "");
IntPoint idNumField = new IntPoint("id_num", 0);
TextField textField = new TextField("text", "", Field.Store.NO);

document.add(idField);
document.add(idNumField);
document.add(textField);

String line;
Expand All @@ -37,8 +39,10 @@ public static void main(String[] args) throws IOException {
}
final JsonObject parsed_doc = Json.parse(line).asObject();
final String id = parsed_doc.get("id").asString();
final int id_num = parsed_doc.get("id_num").asInt();
final String text = parsed_doc.get("text").asString();
idField.setStringValue(id);
idNumField.setIntValue(id_num);
textField.setStringValue(text);
writer.addDocument(document);
}
Expand Down
29 changes: 17 additions & 12 deletions engines/tantivy-0.13/src/bin/build_index.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::Index;
use futures::executor::block_on;
use std::env;
use std::io::BufRead;
use std::path::Path;
use futures::executor::block_on;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::Index;

/// Entry point: reads the output directory from the first command-line
/// argument and hands it to `main_inner`.
///
/// Prints a usage message to stderr and exits with status 1 when the
/// argument is missing.
///
/// # Panics
///
/// Panics if `main_inner` returns an error (its result is unwrapped).
fn main() {
    // `nth(1)` skips the program name and yields the first real argument.
    let output_dir = match env::args().nth(1) {
        Some(dir) => dir,
        None => {
            eprintln!("usage: build_index <output_dir>");
            std::process::exit(1);
        }
    };
    // `Path::new` already returns a `&Path`, so no extra borrow is needed.
    main_inner(Path::new(&output_dir)).unwrap();
}

fn create_schema() -> Schema {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("id", STORED);
schema_builder.add_text_field("text", TEXT);
schema_builder.build()
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("id", STORED);
schema_builder.add_u64_field("id_num", FAST | INDEXED);
schema_builder.add_text_field("text", TEXT);
schema_builder.build()
}

fn main_inner(output_dir: &Path) -> tantivy::Result<()> {
Expand All @@ -25,7 +26,9 @@ fn main_inner(output_dir: &Path) -> tantivy::Result<()> {

let mut i = 0;
{
let mut index_writer = index.writer_with_num_threads(4, 2_000_000_000).expect("failed to create index writer");
let mut index_writer = index
.writer_with_num_threads(4, 2_000_000_000)
.expect("failed to create index writer");
let stdin = std::io::stdin();

for line in stdin.lock().lines() {
Expand All @@ -44,9 +47,11 @@ fn main_inner(output_dir: &Path) -> tantivy::Result<()> {
index_writer.commit()?;
index_writer.wait_merging_threads()?;
}
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer = index.writer(1_500_000_000).expect("failed to create index writer");
block_on(index_writer.merge(&segment_ids))?;
block_on(index_writer.garbage_collect_files())?;
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer = index
.writer(1_500_000_000)
.expect("failed to create index writer");
block_on(index_writer.merge(&segment_ids))?;
block_on(index_writer.garbage_collect_files())?;
Ok(())
}
29 changes: 17 additions & 12 deletions engines/tantivy-0.16/src/bin/build_index.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::Index;
use futures::executor::block_on;
use std::env;
use std::io::BufRead;
use std::path::Path;
use futures::executor::block_on;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::Index;

/// Entry point: reads the output directory from the first command-line
/// argument and hands it to `main_inner`.
///
/// Prints a usage message to stderr and exits with status 1 when the
/// argument is missing.
///
/// # Panics
///
/// Panics if `main_inner` returns an error (its result is unwrapped).
fn main() {
    // `nth(1)` skips the program name and yields the first real argument.
    let output_dir = match env::args().nth(1) {
        Some(dir) => dir,
        None => {
            eprintln!("usage: build_index <output_dir>");
            std::process::exit(1);
        }
    };
    // `Path::new` already returns a `&Path`, so no extra borrow is needed.
    main_inner(Path::new(&output_dir)).unwrap();
}

fn create_schema() -> Schema {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("id", STORED);
schema_builder.add_text_field("text", TEXT);
schema_builder.build()
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("id", STORED);
schema_builder.add_u64_field("id_num", FAST | INDEXED);
schema_builder.add_text_field("text", TEXT);
schema_builder.build()
}

fn main_inner(output_dir: &Path) -> tantivy::Result<()> {
Expand All @@ -25,7 +26,9 @@ fn main_inner(output_dir: &Path) -> tantivy::Result<()> {

let mut i = 0;
{
let mut index_writer = index.writer_with_num_threads(4, 2_000_000_000).expect("failed to create index writer");
let mut index_writer = index
.writer_with_num_threads(4, 2_000_000_000)
.expect("failed to create index writer");
let stdin = std::io::stdin();

for line in stdin.lock().lines() {
Expand All @@ -44,9 +47,11 @@ fn main_inner(output_dir: &Path) -> tantivy::Result<()> {
index_writer.commit()?;
index_writer.wait_merging_threads()?;
}
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer = index.writer(1_500_000_000).expect("failed to create index writer");
block_on(index_writer.merge(&segment_ids))?;
block_on(index_writer.garbage_collect_files())?;
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer = index
.writer(1_500_000_000)
.expect("failed to create index writer");
block_on(index_writer.merge(&segment_ids))?;
block_on(index_writer.garbage_collect_files())?;
Ok(())
}
29 changes: 17 additions & 12 deletions engines/tantivy-0.17/src/bin/build_index.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::Index;
use futures::executor::block_on;
use std::env;
use std::io::BufRead;
use std::path::Path;
use futures::executor::block_on;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::Index;

/// Entry point: reads the output directory from the first command-line
/// argument and hands it to `main_inner`.
///
/// Prints a usage message to stderr and exits with status 1 when the
/// argument is missing.
///
/// # Panics
///
/// Panics if `main_inner` returns an error (its result is unwrapped).
fn main() {
    // `nth(1)` skips the program name and yields the first real argument.
    let output_dir = match env::args().nth(1) {
        Some(dir) => dir,
        None => {
            eprintln!("usage: build_index <output_dir>");
            std::process::exit(1);
        }
    };
    // `Path::new` already returns a `&Path`, so no extra borrow is needed.
    main_inner(Path::new(&output_dir)).unwrap();
}

fn create_schema() -> Schema {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("id", STORED);
schema_builder.add_text_field("text", TEXT);
schema_builder.build()
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("id", STORED);
schema_builder.add_u64_field("id_num", FAST | INDEXED);
schema_builder.add_text_field("text", TEXT);
schema_builder.build()
}

fn main_inner(output_dir: &Path) -> tantivy::Result<()> {
Expand All @@ -25,7 +26,9 @@ fn main_inner(output_dir: &Path) -> tantivy::Result<()> {

let mut i = 0;
{
let mut index_writer = index.writer_with_num_threads(4, 2_000_000_000).expect("failed to create index writer");
let mut index_writer = index
.writer_with_num_threads(4, 2_000_000_000)
.expect("failed to create index writer");
let stdin = std::io::stdin();

for line in stdin.lock().lines() {
Expand All @@ -44,9 +47,11 @@ fn main_inner(output_dir: &Path) -> tantivy::Result<()> {
index_writer.commit()?;
index_writer.wait_merging_threads()?;
}
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer = index.writer(1_500_000_000).expect("failed to create index writer");
block_on(index_writer.merge(&segment_ids))?;
block_on(index_writer.garbage_collect_files())?;
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer = index
.writer(1_500_000_000)
.expect("failed to create index writer");
block_on(index_writer.merge(&segment_ids))?;
block_on(index_writer.garbage_collect_files())?;
Ok(())
}
2 changes: 1 addition & 1 deletion results.json

Large diffs are not rendered by default.

25 changes: 23 additions & 2 deletions src/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@

class SearchClient:

def __init__(self, engine):
def __init__(self, engine, unsupported_queries):
self.engine = engine
self.unsupported_queries = unsupported_queries
dirname = os.path.split(os.path.abspath(__file__))[0]
dirname = path.dirname(dirname)
dirname = path.join(dirname, "engines")
Expand All @@ -23,6 +24,8 @@ def __init__(self, engine):
stdin=subprocess.PIPE)

def query(self, query, command):
if query in unsupported_queries:
return None
query_line = "%s\t%s\n" % (command, query)
self.process.stdin.write(query_line.encode("utf-8"))
self.process.stdin.flush()
Expand Down Expand Up @@ -57,19 +60,37 @@ def read_queries(query_path):
WARMUP_ITER = 1
NUM_ITER = 3

def filter_non_range_queries(queries):
    """Return, in their original order, the queries whose tags do not contain 'range'."""
    kept = []
    for candidate in queries:
        if 'range' in candidate.tags:
            continue
        kept.append(candidate)
    return kept

def get_range_queries(queries):
    """Return the set of query strings (``.query``) of all queries tagged 'range'."""
    return {entry.query for entry in queries if 'range' in entry.tags}

if __name__ == "__main__":
import sys
random.seed(2)
query_path = sys.argv[1]
engines = sys.argv[2:]
range_query_enabled_engines = os.environ['RANGE_QUERY_ENABLED_ENGINES'].split(" ")
range_query_enabled_engines = [engine.strip() for engine in range_query_enabled_engines]
queries = list(read_queries(query_path))
# non_range_queries = filter_non_range_queries(queries)
range_queries = get_range_queries(queries)
results = {}
for command in COMMANDS:
results_commands = {}
for engine in engines:
engine_results = []
query_idx = {}
if engine in range_query_enabled_engines:
unsupported_queries = set()
else:
unsupported_queries = range_queries

for query in queries:
query_result = {
"query": query.query,
Expand All @@ -81,7 +102,7 @@ def read_queries(query_path):
engine_results.append(query_result)
print("======================")
print("BENCHMARKING %s %s" % (engine, command))
search_client = SearchClient(engine)
search_client = SearchClient(engine, unsupported_queries)
print("--- Warming up ...")
queries_shuffled = list(queries[:])
random.seed(2)
Expand Down
2 changes: 1 addition & 1 deletion web/build/results.json

Large diffs are not rendered by default.

0 comments on commit e745fc5

Please sign in to comment.