Skip to content

Commit

Permalink
Add full OCR text search and highlighting (#944)
Browse files (browse the repository at this point in the history)
  • Loading branch information
blms committed Nov 3, 2023
1 parent d28483b commit 4bff60a
Showing 1 changed file with 38 additions and 8 deletions.
46 changes: 38 additions & 8 deletions apps/readux/views.py
Original file line number | Diff line number | Diff line change
Expand Up
@@ -407,42 +407,72 @@ def get_queryset(self):
form_data = form.cleaned_data

# default to empty string if no query in form data
search_query = form_data.get("q", "")
search_query = form_data.get("q") or ""
scope = form_data.get("scope") or "all"
if search_query:
multimatch_query = MultiMatch(query=search_query, fields=self.query_search_fields)
volumes = volumes.query(multimatch_query)
queries = []
if scope in ["all", "metadata"]:
# query for root level fields
multimatch_query = Q(
"multi_match", query=search_query, fields=self.query_search_fields
)
queries.append(multimatch_query)

if scope in ["all", "text"]:
# query for nested fields (i.e. canvas position and text)
nested_query = Q(
"nested",
path="canvas_set",
query=Q(
"multi_match",
query=search_query,
fields=["canvas_set.result"],
),
inner_hits={
"name": "canvases",
"size": 3, # max number of pages shown in full-text results
"sort": [{"canvas_set.position": {"order": "asc"}}],
"highlight": {"fields": {"canvas_set.result": {}}},
},
)
queries.append(nested_query)

# combine them with bool: { should }
q = Q("bool", should=queries)
volumes = volumes.query(q)

# highlight
volumes = volumes.highlight_options(
require_field_match=False,
fragment_size=200,
number_of_fragments=10,
max_analyzed_offset=999999,
).highlight(
"label", "author", "summary"
)

# filter on authors
author_filter = form_data.get("author", "")
author_filter = form_data.get("author") or ""
if author_filter:
volumes = volumes.filter("terms", authors=author_filter)

# filter on languages
language_filter = form_data.get("language", "")
language_filter = form_data.get("language") or ""
if language_filter:
volumes = volumes.filter("terms", languages=language_filter)

# filter on collections
collection_filter = form_data.get("collection", "")
collection_filter = form_data.get("collection") or ""
if collection_filter:
volumes = volumes.filter("nested", path="collections", query=Q(
"terms", **{"collections.label": collection_filter}
))

# filter on date published
min_date_filter = form_data.get("start_date", "")
min_date_filter = form_data.get("start_date") or ""
if min_date_filter:
volumes = volumes.filter("range", date_earliest={"gte": min_date_filter})
max_date_filter = form_data.get("end_date", "")
max_date_filter = form_data.get("end_date") or ""
if max_date_filter:
volumes = volumes.filter("range", date_latest={"lte": max_date_filter})

Expand Down

0 comments on commit 4bff60a

Please sign in to comment.