Skip to content

Commit

Permalink
add range queries
Browse files — browse the repository at this point in the history
add new field id_num
add range queries over id_num
  • Loading branch information
PSeitz committed Dec 28, 2022
1 parent 985601d commit 64abd2d
Show file tree
Hide file tree
Showing 8 changed files with 21 additions and 7 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ COMMANDS ?= TOP_10 TOP_10_COUNT COUNT

# ENGINES ?= tantivy-0.13 lucene-8.4.0 pisa-0.8.2 rucene-0.1 bleve-0.8.0-scorch rucene-0.1 tantivy-0.11 tantivy-0.14 tantivy-0.15 tantivy-0.16 tantivy-0.17 tantivy-0.18 tantivy-0.19
# ENGINES ?= tantivy-0.16 lucene-8.10.1 pisa-0.8.2 bleve-0.8.0-scorch rucene-0.1
ENGINES ?= tantivy-0.16 tantivy-0.17 tantivy-0.18 tantivy-0.19
ENGINES ?= tantivy-0.19 lucene-8.10.1
PORT ?= 8080

help:
Expand Down
3 changes: 3 additions & 0 deletions corpus_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,12 @@ def transform(text):

if doc["url"] == "":
continue
if len(doc["url"].split("curid=",1)) == 1:
continue

doc_transformed = {
"id": doc["url"],
"id_num": int(doc["url"].split("curid=",1)[1]),
"text": transform(doc["body"])
}

Expand Down
4 changes: 4 additions & 0 deletions engines/lucene-8.10.1/src/main/java/BuildIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,11 @@ public static void main(String[] args) throws IOException {
final Document document = new Document();

StoredField idField = new StoredField("id", "");
IntPoint idNumField = new IntPoint("id_num", 4);
TextField textField = new TextField("text", "", Field.Store.NO);

document.add(idField);
document.add(idNumField);
document.add(textField);

String line;
Expand All @@ -37,8 +39,10 @@ public static void main(String[] args) throws IOException {
}
final JsonObject parsed_doc = Json.parse(line).asObject();
final String id = parsed_doc.get("id").asString();
final int id_num = parsed_doc.get("id_num").asInt();
final String text = parsed_doc.get("text").asString();
idField.setStringValue(id);
idNumField.setIntValue(id_num);
textField.setStringValue(text);
writer.addDocument(document);
}
Expand Down
3 changes: 2 additions & 1 deletion engines/tantivy-0.19/src/bin/build_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use futures::executor::block_on;
use std::env;
use std::io::BufRead;
use std::path::Path;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::schema::{Schema, FAST, INDEXED, STORED, TEXT};
use tantivy::Index;

fn main() {
Expand All @@ -12,6 +12,7 @@ fn main() {

fn create_schema() -> Schema {
let mut schema_builder = Schema::builder();
schema_builder.add_u64_field("id_num", FAST | INDEXED);
schema_builder.add_text_field("id", STORED);
schema_builder.add_text_field("text", TEXT);
schema_builder.build()
Expand Down
6 changes: 3 additions & 3 deletions format_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,14 @@ def generate_queries(words):
}

for line in fileinput.input():
(count, query) = PTN.split(line.decode("utf-8").strip(), 1)
(count, query) = PTN.split(line.strip(), 1)
count = int(count)
if not LETTERS_ONLY.match(query):
continue
words = PTN.split(query)
for q in generate_queries(words):
try:
qdoc = json.dumps(q).encode("utf-8")
print qdoc
qdoc = json.dumps(q)
print(qdoc)
except:
pass
6 changes: 6 additions & 0 deletions queries.txt
Original file line number Diff line number Diff line change
Expand Up @@ -897,3 +897,9 @@
{"query": "\"laborers international union of north america\"", "tags": ["phrase", "phrase:num_tokens_>3"]}
{"query": "laborers international union of north america", "tags": ["union", "global", "union:num_tokens_>3"]}
{"query": "+\"the who\" +uk", "tags": ["two-phase-critic"]}
{"query": "id_num:[48694410 TO 48694420] +griffith +observatory", "tags": ["range", "range_selective"]}
{"query": "id_num:[48694410 TO 48694420] +the", "tags": ["range", "range_selective"]}
{"query": "id_num:[48694410 TO 48694420] niceville high school", "tags": ["range", "range_selective"]}
{"query": "id_num:[0 TO 10000000] +griffith +observatory", "tags": ["range", "range_unselective"]}
{"query": "id_num:[0 TO 10000000] +the", "tags": ["range", "range_unselective"]}
{"query": "id_num:[0 TO 10000000] niceville high school", "tags": ["range", "range_unselective"]}
2 changes: 1 addition & 1 deletion results.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion web/build/results.json

Large diffs are not rendered by default.

0 comments on commit 64abd2d

Please sign in to comment.