From f4470dc401e513cf178290977c20ae1b9d80eb23 Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Fri, 3 Nov 2023 12:06:32 -0400 Subject: [PATCH 1/7] Index canvases and annotations on manifests (#944) --- apps/iiif/manifests/documents.py | 48 ++++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/apps/iiif/manifests/documents.py b/apps/iiif/manifests/documents.py index f6f94e7cd..371c4d75f 100644 --- a/apps/iiif/manifests/documents.py +++ b/apps/iiif/manifests/documents.py @@ -4,11 +4,14 @@ from django_elasticsearch_dsl import Document, fields from django_elasticsearch_dsl.registries import registry from elasticsearch_dsl import analyzer +from django.db.models.query import Prefetch from django.utils.html import strip_tags from unidecode import unidecode +from apps.iiif.annotations.models import Annotation +from apps.iiif.canvases.models import Canvas from apps.iiif.kollections.models import Collection -from .models import Manifest +from apps.iiif.manifests.models import Manifest # TODO: Better English stemming (e.g. Rome to match Roman), multilingual stemming. 
stemmer = analyzer( @@ -25,9 +28,15 @@ class ManifestDocument(Document): # fields to map explicitly in Elasticsearch authors = fields.KeywordField(multi=True) # only used for faceting/filtering author = fields.TextField() # only used for searching - collections = fields.NestedField(properties={ - "label": fields.KeywordField(), - }) + canvas_set = fields.NestedField( + properties={ + "result": fields.TextField(analyzer=stemmer), + "position": fields.IntegerField(), + "thumbnail": fields.KeywordField(), + "pid": fields.KeywordField(), + } + ) # canvas_set.result = OCR annotation text on each canvas + collections = fields.NestedField(properties={"label": fields.KeywordField()}) date_earliest = fields.DateField() date_latest = fields.DateField() has_pdf = fields.BooleanField() @@ -38,10 +47,12 @@ class ManifestDocument(Document): class Index: """Settings for Elasticsearch""" + name = "manifests" class Django: """Settings for automatically pulling data from Django""" + model = Manifest # fields to map dynamically in Elasticsearch @@ -57,7 +68,7 @@ class Django: "publisher", "viewingdirection", ] - related_models = [Collection] + related_models = [Collection, Canvas, Annotation] def prepare_authors(self, instance): """convert authors string into list""" @@ -88,12 +99,33 @@ def prepare_summary(self, instance): def get_queryset(self): """prefetch related to improve performance""" - return super().get_queryset().prefetch_related( - "collections" + return ( + super() + .get_queryset() + .prefetch_related( + "collections", + "image_server", + "languages", + Prefetch( + "canvas_set", + queryset=Canvas.objects.prefetch_related( + Prefetch( + "annotation_set", + queryset=Annotation.objects.select_related("owner"), + ), + ), + ), + ) ) def get_instances_from_related(self, related_instance): - """Retrieving item to index from related collections""" + """Retrieving item to index from related objects""" if isinstance(related_instance, Collection): # many to many relationship return 
related_instance.manifests.all() + elif isinstance(related_instance, Canvas): + # many to one relationship: a canvas belongs to a single manifest + return related_instance.manifest + elif isinstance(related_instance, Annotation): + # many to one relationship: reach the manifest through the annotation's canvas + return related_instance.canvas.manifest From e81745a8dad75d691ee98219b0e9bb5365594458 Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Fri, 3 Nov 2023 12:06:57 -0400 Subject: [PATCH 2/7] Clean up Canvas.result method (#944) --- apps/iiif/canvases/models.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/apps/iiif/canvases/models.py b/apps/iiif/canvases/models.py index e6f59e0f9..6032daed3 100644 --- a/apps/iiif/canvases/models.py +++ b/apps/iiif/canvases/models.py @@ -1,5 +1,6 @@ """Django models representing IIIF canvases and IIIF image server info.""" import os +from functools import cached_property from urllib.parse import quote from boto3 import resource from bs4 import BeautifulSoup @@ -144,23 +145,21 @@ def thumbnail_crop_volume(self): # landscape return f'{self.resource_id}/pct:25,15,50,85/,600/0/default.jpg' - @property + @cached_property def result(self): - """Empty attribute to hold the result of requests to get OCR data.""" - words = Annotation.objects.filter( - owner=USER.objects.get(username='ocr'), - canvas=self.id).order_by('order') + """Cached property containing OCR text content from associated annotations.""" + words = self.annotation_set.filter(owner__username="ocr").order_by("order") clean_words = [] for word in words: - clean_word = BeautifulSoup(word.content, 'html.parser').text + clean_word = BeautifulSoup(word.content, "html.parser").text clean_words.append(clean_word) - return ' '.join(clean_words) + return " ".join(clean_words) def save(self, *args, **kwargs): # pylint: disable = signature-differs """ Override save function to set `resource_id` add OCR, set as manifest's `start_canvas` if manifest does not have one, - and set + and set position """ self.__check_image_server() From 
d28483bd5d4c7de7f4c8cb367fe5a4ef697934de Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Fri, 3 Nov 2023 12:34:00 -0400 Subject: [PATCH 3/7] Add scope option to search (#944) --- apps/readux/forms.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/apps/readux/forms.py b/apps/readux/forms.py index b093a186c..3c2e810e0 100644 --- a/apps/readux/forms.py +++ b/apps/readux/forms.py @@ -57,6 +57,21 @@ class ManifestSearchForm(forms.Form): }, ), ) + scope = forms.ChoiceField( + label="Limit search to", + required=False, + initial="all", + choices=( + ("all", "All"), + ("metadata", "Metadata only"), + ("text", "Textual contents only"), + ), + widget=forms.Select( + attrs={ + "class": "uk-select", + }, + ), + ) language = FacetedMultipleChoiceField( label="Language", required=False, From 4bff60ad2c72f7ca3c3df42e082204a64ded715a Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Fri, 3 Nov 2023 12:34:26 -0400 Subject: [PATCH 4/7] Add full OCR text search and highlighting (#944) --- apps/readux/views.py | 46 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/apps/readux/views.py b/apps/readux/views.py index 0d3f24a7f..02f4d2ac3 100644 --- a/apps/readux/views.py +++ b/apps/readux/views.py @@ -407,42 +407,72 @@ def get_queryset(self): form_data = form.cleaned_data # default to empty string if no query in form data - search_query = form_data.get("q", "") + search_query = form_data.get("q") or "" + scope = form_data.get("scope") or "all" if search_query: - multimatch_query = MultiMatch(query=search_query, fields=self.query_search_fields) - volumes = volumes.query(multimatch_query) + queries = [] + if scope in ["all", "metadata"]: + # query for root level fields + multimatch_query = Q( + "multi_match", query=search_query, fields=self.query_search_fields + ) + queries.append(multimatch_query) + + if scope in ["all", "text"]: + # query for nested fields (i.e. 
canvas position and text) + nested_query = Q( + "nested", + path="canvas_set", + query=Q( + "multi_match", + query=search_query, + fields=["canvas_set.result"], + ), + inner_hits={ + "name": "canvases", + "size": 3, # max number of pages shown in full-text results + "sort": [{"canvas_set.position": {"order": "asc"}}], + "highlight": {"fields": {"canvas_set.result": {}}}, + }, + ) + queries.append(nested_query) + + # combine them with bool: { should } + q = Q("bool", should=queries) + volumes = volumes.query(q) # highlight volumes = volumes.highlight_options( require_field_match=False, fragment_size=200, number_of_fragments=10, + max_analyzed_offset=999999, ).highlight( "label", "author", "summary" ) # filter on authors - author_filter = form_data.get("author", "") + author_filter = form_data.get("author") or "" if author_filter: volumes = volumes.filter("terms", authors=author_filter) # filter on languages - language_filter = form_data.get("language", "") + language_filter = form_data.get("language") or "" if language_filter: volumes = volumes.filter("terms", languages=language_filter) # filter on collections - collection_filter = form_data.get("collection", "") + collection_filter = form_data.get("collection") or "" if collection_filter: volumes = volumes.filter("nested", path="collections", query=Q( "terms", **{"collections.label": collection_filter} )) # filter on date published - min_date_filter = form_data.get("start_date", "") + min_date_filter = form_data.get("start_date") or "" if min_date_filter: volumes = volumes.filter("range", date_earliest={"gte": min_date_filter}) - max_date_filter = form_data.get("end_date", "") + max_date_filter = form_data.get("end_date") or "" if max_date_filter: volumes = volumes.filter("range", date_latest={"lte": max_date_filter}) From 772546e460ac7d4783b037ae64e917123b1e53f4 Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Fri, 3 Nov 2023 12:35:34 -0400 Subject: [PATCH 5/7] Add scope and full text search to frontend (#944) 
--- .gitignore | 1 - apps/cms/templatetags/readux_templatetags.py | 18 ++++++++- apps/static/css/project.css | 39 +++++++++++++++++++- apps/templates/search_results.html | 21 +++++++---- apps/templates/snippets/volume_result.html | 29 ++++++++++++--- 5 files changed, 92 insertions(+), 16 deletions(-) diff --git a/.gitignore b/.gitignore index 48a309237..5a9b95519 100644 --- a/.gitignore +++ b/.gitignore @@ -146,7 +146,6 @@ assets/upload/* !assets/upload/index.html *_dev -snippets # profiler *.profile diff --git a/apps/cms/templatetags/readux_templatetags.py b/apps/cms/templatetags/readux_templatetags.py index e9b708e0b..66d0e3d2b 100644 --- a/apps/cms/templatetags/readux_templatetags.py +++ b/apps/cms/templatetags/readux_templatetags.py @@ -2,7 +2,23 @@ register = Library() + @register.filter_function def order_by(queryset, args): - args = [x.strip() for x in args.split(',')] + args = [x.strip() for x in args.split(",")] return queryset.order_by(*args) + + +@register.filter +def dict_item(dictionary, key): + """Template filter to allow accessing dictionary value by variable key. 
+ Example use:: + + {{ mydict|dict_item:keyvar }} + """ + # adapted from Princeton-CDH/geniza project https://github.com/Princeton-CDH/geniza/ + try: + return dictionary[key] + except AttributeError: + # fail silently if something other than a dict is passed + return None diff --git a/apps/static/css/project.css b/apps/static/css/project.css index cfef32f3b..840b2e05e 100644 --- a/apps/static/css/project.css +++ b/apps/static/css/project.css @@ -41,6 +41,20 @@ em { border: 1px solid transparent; } +#search-form .scope-and-keyword { + display: flex; + flex-flow: row nowrap; + margin: 1rem 0 0; + gap: 0.5rem; +} + +#search-form .scope-and-keyword .scope { + flex: 0 1 auto; +} +#search-form .scope-and-keyword .keyword { + flex: 1 0 auto; +} + .uk-tab > .uk-active > a { border-color: var(--link-color) !important; color: var(--link-color) !important; } @@ -1829,12 +1843,35 @@ ol#search-results dl { margin-left: 2rem; } /* Clamp summary to 3 lines */ -ol#search-results dd.result-volume-summary { +ol#search-results .result-volume-summary { display: -webkit-box; -webkit-box-orient: vertical; -webkit-line-clamp: 3; overflow: hidden; } +/* search result highlighting */ +ol#search-results em { + color: #510029 !important; + font-weight: bold; + font-style: normal; } + +/* results within full text */ +ol#search-results .result-page a { + display: flex; + flex-flow: row nowrap; + justify-content: flex-start; + align-items: center; + font-size: 0.8rem; + gap: 1rem; + padding: 0.25rem 1rem; } + +ol#search-results .result-page .page-number { + min-width: 2rem; + max-width: 2rem;} +ol#search-results .result-page img { + max-width: 100px; + max-height: 100px; } + .sr-only { position: absolute; width: 1px; diff --git a/apps/templates/search_results.html b/apps/templates/search_results.html index 091a3f2e7..a08889a76 100644 --- a/apps/templates/search_results.html +++ b/apps/templates/search_results.html @@ -48,16 +48,21 @@

Search

accept-charset="utf-8" >
-
-
- - {{ form.q }} +
+
+ {{ form.scope }} +
+
+
+ + {{ form.q }} +
- - Search for individual whole keywords. Multiple words will be searched as - 'or' (e.g. Rome London = Rome or London). -
+ + Search for individual whole keywords. Multiple words will be searched as + 'or' (e.g. Rome London = Rome or London). +
{{ form.sort.label }}
{{ form.sort }} diff --git a/apps/templates/snippets/volume_result.html b/apps/templates/snippets/volume_result.html index f8ba8bd09..7d3633bd1 100644 --- a/apps/templates/snippets/volume_result.html +++ b/apps/templates/snippets/volume_result.html @@ -1,3 +1,4 @@ +{% load readux_templatetags %}
  • @@ -7,7 +8,7 @@

    class="nav-link" href="{% url 'volumeall' volume.pid %}" > - {% if volume.meta.highlight.label %} + {% if 'highlight' in volume.meta and volume.meta.highlight.label %} {% for fragment in volume.meta.highlight.label %} {{ fragment|safe }} {% endfor %} @@ -16,7 +17,7 @@

    {% endif %} {% else %} - {% if volume.meta.highlight.label %} + {% if 'highlight' in volume.meta and volume.meta.highlight.label %} {% for fragment in volume.meta.highlight.label %} {{ fragment|safe }} {% endfor %} @@ -34,7 +35,7 @@

    Author{{volume.authors|pluralize}}