ecds · jayvarner · Nov 15, 2023 · Nov 3, 2023 · Nov 3, 2023 · Nov 3, 2023
diff --git a/.gitignore b/.gitignore
@@ -146,7 +146,6 @@ assets/upload/*
 !assets/upload/index.html
 
 *_dev
-snippets
 
 # profiler
 *.profile

diff --git a/apps/cms/templatetags/readux_templatetags.py b/apps/cms/templatetags/readux_templatetags.py
@@ -2,7 +2,23 @@
 
 register = Library()
 
+
 @register.filter_function
 def order_by(queryset, args):
-    args = [x.strip() for x in args.split(',')]
+    args = [x.strip() for x in args.split(",")]
     return queryset.order_by(*args)
+
+
+@register.filter
+def dict_item(dictionary, key):
+    """Template filter to allow accessing dictionary value by variable key.
+    Returns None if the key is missing or the value is not a dict. Example use::
+
+        {{ mydict|dict_item:keyvar }}
+    """
+    # adapted from Princeton-CDH/geniza project https://github.com/Princeton-CDH/geniza/
+    try:
+        return dictionary[key]
+    except (AttributeError, KeyError, TypeError):
+        # subscripting raises KeyError (missing key) or TypeError (not a dict); fail silently
+        return None
diff --git a/apps/iiif/canvases/management/commands/rebuild_ocr.py b/apps/iiif/canvases/management/commands/rebuild_ocr.py
@@ -123,4 +123,5 @@ def __rebuild(self, canvas, testing=False):
                     anno.content = word['content']
                     anno.save()
                     prog_bar.next()
+                canvas.save()
                 prog_bar.finish()
diff --git a/apps/iiif/canvases/models.py b/apps/iiif/canvases/models.py
@@ -1,5 +1,6 @@
 """Django models representing IIIF canvases and IIIF image server info."""
 import os
+from functools import cached_property
 from urllib.parse import quote
 from boto3 import resource
 from bs4 import BeautifulSoup
@@ -144,23 +145,21 @@ def thumbnail_crop_volume(self):
         # landscape
         return f'{self.resource_id}/pct:25,15,50,85/,600/0/default.jpg'
 
-    @property
+    @cached_property
     def result(self):
-        """Empty attribute to hold the result of requests to get OCR data."""
-        words = Annotation.objects.filter(
-            owner=USER.objects.get(username='ocr'),
-            canvas=self.id).order_by('order')
+        """Cached property containing OCR text content from associated annotations."""
+        words = self.annotation_set.filter(owner__username="ocr").order_by("order")
         clean_words = []
         for word in words:
-            clean_word = BeautifulSoup(word.content, 'html.parser').text
+            clean_word = BeautifulSoup(word.content, "html.parser").text
             clean_words.append(clean_word)
-        return ' '.join(clean_words)
+        return " ".join(clean_words)
 
     def save(self, *args, **kwargs): # pylint: disable = signature-differs
         """
         Override save function to set `resource_id` add OCR,
         set as manifest's `start_canvas` if manifest does not have one,
-        and set
+        and set position
         """
         self.__check_image_server()
 

diff --git a/apps/iiif/canvases/tasks.py b/apps/iiif/canvases/tasks.py
@@ -16,6 +16,7 @@ def add_ocr_task(canvas_id, *args, **kwargs):
 
     if ocr is not None:
         add_ocr_annotations(canvas, ocr)
+        canvas.save()  # trigger reindex
 
 @app.task(name='adding_oa_ocr_to_canvas', retry_backoff=5)
 def add_oa_ocr_task(annotation_list_url):

diff --git a/apps/iiif/manifests/documents.py b/apps/iiif/manifests/documents.py
@@ -4,11 +4,14 @@
 from django_elasticsearch_dsl import Document, fields
 from django_elasticsearch_dsl.registries import registry
 from elasticsearch_dsl import analyzer
+from django.db.models.query import Prefetch
 from django.utils.html import strip_tags
 from unidecode import unidecode
 
+from apps.iiif.annotations.models import Annotation
+from apps.iiif.canvases.models import Canvas
 from apps.iiif.kollections.models import Collection
-from .models import Manifest
+from apps.iiif.manifests.models import Manifest
 
 # TODO: Better English stemming (e.g. Rome to match Roman), multilingual stemming.
 stemmer = analyzer(
@@ -25,9 +28,14 @@ class ManifestDocument(Document):
     # fields to map explicitly in Elasticsearch
     authors = fields.KeywordField(multi=True)  # only used for faceting/filtering
     author = fields.TextField()  # only used for searching
-    collections = fields.NestedField(properties={
-        "label": fields.KeywordField(),
-    })
+    canvas_set = fields.NestedField(
+        properties={
+            "result": fields.TextField(analyzer=stemmer),
+            "position": fields.IntegerField(),
+            "pid": fields.KeywordField(),
+        }
+    )  # canvas_set.result = OCR annotation text on each canvas
+    collections = fields.NestedField(properties={"label": fields.KeywordField()})
     date_earliest = fields.DateField()
     date_latest = fields.DateField()
     has_pdf = fields.BooleanField()
@@ -38,10 +46,12 @@ class ManifestDocument(Document):
 
     class Index:
         """Settings for Elasticsearch"""
+
         name = "manifests"
 
     class Django:
         """Settings for automatically pulling data from Django"""
+
         model = Manifest
 
         # fields to map dynamically in Elasticsearch
@@ -57,7 +67,7 @@ class Django:
             "publisher",
             "viewingdirection",
         ]
-        related_models = [Collection]
+        related_models = [Collection, Canvas]
 
     def prepare_authors(self, instance):
         """convert authors string into list"""
@@ -88,12 +98,30 @@ def prepare_summary(self, instance):
 
     def get_queryset(self):
         """prefetch related to improve performance"""
-        return super().get_queryset().prefetch_related(
-            "collections"
+        return (
+            super()
+            .get_queryset()
+            .prefetch_related(
+                "collections",
+                "image_server",
+                "languages",
+                Prefetch(
+                    "canvas_set",
+                    queryset=Canvas.objects.prefetch_related(
+                        Prefetch(
+                            "annotation_set",
+                            queryset=Annotation.objects.select_related("owner"),
+                        ),
+                    ),
+                ),
+            )
         )
 
     def get_instances_from_related(self, related_instance):
-        """Retrieving item to index from related collections"""
+        """Retrieving item to index from related objects"""
         if isinstance(related_instance, Collection):
             # many to many relationship
             return related_instance.manifests.all()
+        elif isinstance(related_instance, Canvas):
+            # foreign key relationship: a canvas belongs to a single manifest
+            return related_instance.manifest
diff --git a/apps/readux/forms.py b/apps/readux/forms.py
@@ -57,6 +57,21 @@ class ManifestSearchForm(forms.Form):
             },
         ),
     )
+    scope = forms.ChoiceField(
+        label="Limit search to",
+        required=False,
+        initial="all",
+        choices=(
+            ("all", "All"),
+            ("metadata", "Metadata only"),
+            ("text", "Textual contents only"),
+        ),
+        widget=forms.Select(
+            attrs={
+                "class": "uk-select",
+            },
+        ),
+    )
     language = FacetedMultipleChoiceField(
         label="Language",
         required=False,

diff --git a/apps/readux/views.py b/apps/readux/views.py
@@ -407,42 +407,74 @@ def get_queryset(self):
         form_data = form.cleaned_data
 
         # default to empty string if no query in form data
-        search_query = form_data.get("q", "")
+        search_query = form_data.get("q") or ""
+        scope = form_data.get("scope") or "all"
         if search_query:
-            multimatch_query = MultiMatch(query=search_query, fields=self.query_search_fields)
-            volumes = volumes.query(multimatch_query)
+            queries = []
+            if scope in ["all", "metadata"]:
+                # query for root level fields
+                multimatch_query = Q(
+                    "multi_match", query=search_query, fields=self.query_search_fields
+                )
+                queries.append(multimatch_query)
+
+            if scope in ["all", "text"]:
+                # query for nested fields (i.e. canvas position and text)
+                nested_query = Q(
+                    "nested",
+                    path="canvas_set",
+                    query=Q(
+                        "multi_match",
+                        query=search_query,
+                        fields=["canvas_set.result"],
+                    ),
+                    inner_hits={
+                        "name": "canvases",
+                        "size": 3,  # max number of pages shown in full-text results
+                        "highlight": {"fields": {"canvas_set.result": {}}},
+                    },
+                    # sum scores if in full text only search, so vols with most hits show up first.
+                    # if also searching metadata, use avg (default) instead, to not over-inflate.
+                    score_mode="sum" if scope == "text" else "avg",
+                )
+                queries.append(nested_query)
+
+            # combine them with bool: { should }
+            q = Q("bool", should=queries)
+            volumes = volumes.query(q)
 
         # highlight
         volumes = volumes.highlight_options(
             require_field_match=False,
             fragment_size=200,
             number_of_fragments=10,
+            max_analyzed_offset=999999,
         ).highlight(
             "label", "author", "summary"
         )
 
         # filter on authors
-        author_filter = form_data.get("author", "")
+        author_filter = form_data.get("author") or ""
         if author_filter:
             volumes = volumes.filter("terms", authors=author_filter)
 
         # filter on languages
-        language_filter = form_data.get("language", "")
+        language_filter = form_data.get("language") or ""
         if language_filter:
             volumes = volumes.filter("terms", languages=language_filter)
 
         # filter on collections
-        collection_filter = form_data.get("collection", "")
+        collection_filter = form_data.get("collection") or ""
         if collection_filter:
             volumes = volumes.filter("nested", path="collections", query=Q(
                 "terms", **{"collections.label": collection_filter}
             ))
 
         # filter on date published
-        min_date_filter = form_data.get("start_date", "")
+        min_date_filter = form_data.get("start_date") or ""
         if min_date_filter:
             volumes = volumes.filter("range", date_earliest={"gte": min_date_filter})
-        max_date_filter = form_data.get("end_date", "")
+        max_date_filter = form_data.get("end_date") or ""
         if max_date_filter:
             volumes = volumes.filter("range", date_latest={"lte": max_date_filter})
 

diff --git a/apps/static/css/project.css b/apps/static/css/project.css
diff --git a/apps/templates/search_results.html b/apps/templates/search_results.html
@@ -48,16 +48,21 @@ <h1 class="uk-heading-medium uk-text-center">Search</h1>
         accept-charset="utf-8"
     >
         <div class="uk-form uk-width-1-1">
-            <fieldset class="uk-margin uk-width-1-1">
-                <div class="uk-inline uk-width-1-1">
-                    <span class="uk-form-icon" uk-icon="icon: search" aria-label="search"></span>
-                    {{ form.q }}
+            <fieldset class="uk-margin uk-width-1-1 scope-and-keyword">
+                <div class="scope">
+                    {{ form.scope }}
+                </div>
+                <div class="keyword">
+                    <div class="uk-inline uk-width-1-1">
+                        <span class="uk-form-icon" uk-icon="icon: search" aria-label="search"></span>
+                        {{ form.q }}
+                    </div>
                 </div>
-                <span class="uk-text-small">
-                    Search for individual whole keywords. Multiple words will be searched as
-                    'or' (e.g. Rome London = Rome or London).
-                </span>
             </fieldset>
+            <span class="uk-text-small">
+                Search for individual whole keywords. Multiple words will be searched as
+                'or' (e.g. Rome London = Rome or London).
+            </span>
             <fieldset class="uk-margin uk-width-1-1">
                 <div class="uk-form-label">{{ form.sort.label }}</div>
                 {{ form.sort }}