Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Full text OCR search across all volumes (#944, #945) #953

Merged
merged 7 commits into from
Nov 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,6 @@ assets/upload/*
!assets/upload/index.html

*_dev
snippets

# profiler
*.profile
Expand Down
18 changes: 17 additions & 1 deletion apps/cms/templatetags/readux_templatetags.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,23 @@

register = Library()


@register.filter_function
def order_by(queryset, args):
    """Template filter that orders a queryset by a comma-separated field list.

    Example use::

        {{ queryset|order_by:"field_a,field_b" }}

    :param queryset: object exposing ``order_by(*fields)``
    :param args: single string of field names separated by commas;
        whitespace around each name is stripped
    :return: the result of ``queryset.order_by`` called with the parsed names
    """
    # Parse the argument exactly once: it arrives as one string, and calling
    # ``split`` a second time on the resulting list would raise AttributeError.
    field_names = [name.strip() for name in args.split(",")]
    return queryset.order_by(*field_names)


@register.filter
def dict_item(dictionary, key):
    """Template filter to allow accessing dictionary value by variable key.

    Example use::

        {{ mydict|dict_item:keyvar }}

    :param dictionary: object supporting ``dictionary[key]`` lookups
    :param key: key to look up, typically a template variable
    :return: the value stored under ``key``, or ``None`` when the lookup
        cannot be performed (missing key, or ``dictionary`` is not subscriptable)
    """
    # adapted from Princeton-CDH/geniza project https://github.com/Princeton-CDH/geniza/
    try:
        return dictionary[key]
    except (KeyError, IndexError, TypeError, AttributeError):
        # Fail silently, as a template filter should: subscript access raises
        # KeyError/IndexError for a missing entry and TypeError when the value
        # is not subscriptable (e.g. None) or the key is unhashable.
        # AttributeError is kept for objects whose __getitem__ delegates to
        # attribute access.
        return None
1 change: 1 addition & 0 deletions apps/iiif/canvases/management/commands/rebuild_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,4 +123,5 @@ def __rebuild(self, canvas, testing=False):
anno.content = word['content']
anno.save()
prog_bar.next()
canvas.save()
prog_bar.finish()
15 changes: 7 additions & 8 deletions apps/iiif/canvases/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Django models representing IIIF canvases and IIIF image server info."""
import os
from functools import cached_property
from urllib.parse import quote
from boto3 import resource
from bs4 import BeautifulSoup
Expand Down Expand Up @@ -144,23 +145,21 @@ def thumbnail_crop_volume(self):
# landscape
return f'{self.resource_id}/pct:25,15,50,85/,600/0/default.jpg'

@cached_property
def result(self):
    """Cached property containing OCR text content from associated annotations.

    Collects this canvas's annotations whose owner has the username ``ocr``,
    in ``order`` sequence, strips any HTML markup from each annotation's
    ``content`` and joins the remaining text with single spaces.

    NOTE: because this is a ``cached_property``, the text is computed once per
    instance; an instance that was already read keeps returning the same value
    even if its annotations change afterwards.

    :return: space-separated plain text of the OCR annotations
        (empty string when there are none)
    """
    ocr_annotations = self.annotation_set.filter(owner__username="ocr").order_by(
        "order"
    )
    return " ".join(
        # annotation content may contain markup; keep only the text nodes
        BeautifulSoup(annotation.content, "html.parser").text
        for annotation in ocr_annotations
    )

def save(self, *args, **kwargs): # pylint: disable = signature-differs
"""
Override save function to set `resource_id` add OCR,
set as manifest's `start_canvas` if manifest does not have one,
and set
and set position
"""
self.__check_image_server()

Expand Down
1 change: 1 addition & 0 deletions apps/iiif/canvases/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def add_ocr_task(canvas_id, *args, **kwargs):

if ocr is not None:
add_ocr_annotations(canvas, ocr)
canvas.save() # trigger reindex

@app.task(name='adding_oa_ocr_to_canvas', retry_backoff=5)
def add_oa_ocr_task(annotation_list_url):
Expand Down
44 changes: 36 additions & 8 deletions apps/iiif/manifests/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,14 @@
from django_elasticsearch_dsl import Document, fields
from django_elasticsearch_dsl.registries import registry
from elasticsearch_dsl import analyzer
from django.db.models.query import Prefetch
from django.utils.html import strip_tags
from unidecode import unidecode

from apps.iiif.annotations.models import Annotation
from apps.iiif.canvases.models import Canvas
from apps.iiif.kollections.models import Collection
from .models import Manifest
from apps.iiif.manifests.models import Manifest

# TODO: Better English stemming (e.g. Rome to match Roman), multilingual stemming.
stemmer = analyzer(
Expand All @@ -25,9 +28,14 @@ class ManifestDocument(Document):
# fields to map explicitly in Elasticsearch
authors = fields.KeywordField(multi=True) # only used for faceting/filtering
author = fields.TextField() # only used for searching
collections = fields.NestedField(properties={
"label": fields.KeywordField(),
})
canvas_set = fields.NestedField(
properties={
"result": fields.TextField(analyzer=stemmer),
"position": fields.IntegerField(),
"pid": fields.KeywordField(),
}
) # canvas_set.result = OCR annotation text on each canvas
collections = fields.NestedField(properties={"label": fields.KeywordField()})
date_earliest = fields.DateField()
date_latest = fields.DateField()
has_pdf = fields.BooleanField()
Expand All @@ -38,10 +46,12 @@ class ManifestDocument(Document):

class Index:
"""Settings for Elasticsearch"""

name = "manifests"

class Django:
"""Settings for automatically pulling data from Django"""

model = Manifest

# fields to map dynamically in Elasticsearch
Expand All @@ -57,7 +67,7 @@ class Django:
"publisher",
"viewingdirection",
]
related_models = [Collection]
related_models = [Collection, Canvas]

def prepare_authors(self, instance):
"""convert authors string into list"""
Expand Down Expand Up @@ -88,12 +98,30 @@ def prepare_summary(self, instance):

def get_queryset(self):
    """Return the indexing queryset with related objects prefetched.

    Prefetches ``collections``, ``image_server`` and ``languages``, plus each
    manifest's canvases together with their annotations (and each
    annotation's owner), to improve performance.

    :return: the parent class queryset with the prefetches applied
    """
    # annotations for every canvas, joined with their owner row
    annotations_with_owner = Prefetch(
        "annotation_set",
        queryset=Annotation.objects.select_related("owner"),
    )
    # canvases for every manifest, each carrying the annotation prefetch above
    canvases_with_annotations = Prefetch(
        "canvas_set",
        queryset=Canvas.objects.prefetch_related(annotations_with_owner),
    )
    return (
        super()
        .get_queryset()
        .prefetch_related(
            "collections",
            "image_server",
            "languages",
            canvases_with_annotations,
        )
    )

def get_instances_from_related(self, related_instance):
    """Retrieving item to index from related objects

    :param related_instance: a ``Collection`` or ``Canvas`` instance
    :return: for a ``Collection``, the result of ``manifests.all()``; for a
        ``Canvas``, its ``manifest`` attribute; ``None`` for any other type
    """
    if isinstance(related_instance, Collection):
        # many to many relationship
        return related_instance.manifests.all()
    if isinstance(related_instance, Canvas):
        # returned directly (no ``.all()``): a single related object rather
        # than a manager -- presumably a foreign key; confirm against the
        # Canvas model
        return related_instance.manifest
    # any other related model: nothing to return
    return None
15 changes: 15 additions & 0 deletions apps/readux/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,21 @@ class ManifestSearchForm(forms.Form):
},
),
)
scope = forms.ChoiceField(
label="Limit search to",
required=False,
initial="all",
choices=(
("all", "All"),
("metadata", "Metadata only"),
("text", "Textual contents only"),
),
widget=forms.Select(
attrs={
"class": "uk-select",
},
),
)
language = FacetedMultipleChoiceField(
label="Language",
required=False,
Expand Down
48 changes: 40 additions & 8 deletions apps/readux/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,42 +407,74 @@ def get_queryset(self):
form_data = form.cleaned_data

# default to empty string if no query in form data
search_query = form_data.get("q", "")
search_query = form_data.get("q") or ""
scope = form_data.get("scope") or "all"
if search_query:
multimatch_query = MultiMatch(query=search_query, fields=self.query_search_fields)
volumes = volumes.query(multimatch_query)
queries = []
if scope in ["all", "metadata"]:
# query for root level fields
multimatch_query = Q(
"multi_match", query=search_query, fields=self.query_search_fields
)
queries.append(multimatch_query)

if scope in ["all", "text"]:
# query for nested fields (i.e. canvas position and text)
nested_query = Q(
"nested",
path="canvas_set",
query=Q(
"multi_match",
query=search_query,
fields=["canvas_set.result"],
),
inner_hits={
"name": "canvases",
"size": 3, # max number of pages shown in full-text results
"highlight": {"fields": {"canvas_set.result": {}}},
},
# sum scores if in full text only search, so vols with most hits show up first.
# if also searching metadata, use avg (default) instead, to not over-inflate.
score_mode="sum" if scope == "text" else "avg",
)
queries.append(nested_query)

# combine them with bool: { should }
q = Q("bool", should=queries)
volumes = volumes.query(q)

# highlight
volumes = volumes.highlight_options(
require_field_match=False,
fragment_size=200,
number_of_fragments=10,
max_analyzed_offset=999999,
).highlight(
"label", "author", "summary"
)

# filter on authors
author_filter = form_data.get("author", "")
author_filter = form_data.get("author") or ""
if author_filter:
volumes = volumes.filter("terms", authors=author_filter)

# filter on languages
language_filter = form_data.get("language", "")
language_filter = form_data.get("language") or ""
if language_filter:
volumes = volumes.filter("terms", languages=language_filter)

# filter on collections
collection_filter = form_data.get("collection", "")
collection_filter = form_data.get("collection") or ""
if collection_filter:
volumes = volumes.filter("nested", path="collections", query=Q(
"terms", **{"collections.label": collection_filter}
))

# filter on date published
min_date_filter = form_data.get("start_date", "")
min_date_filter = form_data.get("start_date") or ""
if min_date_filter:
volumes = volumes.filter("range", date_earliest={"gte": min_date_filter})
max_date_filter = form_data.get("end_date", "")
max_date_filter = form_data.get("end_date") or ""
if max_date_filter:
volumes = volumes.filter("range", date_latest={"lte": max_date_filter})

Expand Down
42 changes: 41 additions & 1 deletion apps/static/css/project.css

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 13 additions & 8 deletions apps/templates/search_results.html
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,21 @@ <h1 class="uk-heading-medium uk-text-center">Search</h1>
accept-charset="utf-8"
>
<div class="uk-form uk-width-1-1">
<fieldset class="uk-margin uk-width-1-1">
<div class="uk-inline uk-width-1-1">
<span class="uk-form-icon" uk-icon="icon: search" aria-label="search"></span>
{{ form.q }}
<fieldset class="uk-margin uk-width-1-1 scope-and-keyword">
<div class="scope">
{{ form.scope }}
</div>
<div class="keyword">
<div class="uk-inline uk-width-1-1">
<span class="uk-form-icon" uk-icon="icon: search" aria-label="search"></span>
{{ form.q }}
</div>
</div>
<span class="uk-text-small">
Search for individual whole keywords. Multiple words will be searched as
'or' (e.g. Rome London = Rome or London).
</span>
</fieldset>
<span class="uk-text-small">
Search for individual whole keywords. Multiple words will be searched as
'or' (e.g. Rome London = Rome or London).
</span>
<fieldset class="uk-margin uk-width-1-1">
<div class="uk-form-label">{{ form.sort.label }}</div>
{{ form.sort }}
Expand Down
Loading