Skip to content

Commit

Permalink
In ingest meta, normalize only fields that match Manifest model
Browse files Browse the repository at this point in the history
  • Loading branch information
blms committed Dec 18, 2023
1 parent e2f35a6 commit 466d61a
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 17 deletions.
6 changes: 3 additions & 3 deletions apps/ingest/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from .forms import BulkVolumeUploadForm
from .models import Bulk, IngestTaskWatcher, Local, Remote, S3Ingest
from .services import (clean_metadata, create_manifest, get_associated_meta,
get_metadata_from, lowercase_first_line)
get_metadata_from, normalize_header)

LOGGER = logging.getLogger(__name__)
class LocalAdmin(admin.ModelAdmin):
Expand Down Expand Up @@ -253,7 +253,7 @@ def clean(self):
csv_file = self.cleaned_data.get('metadata_spreadsheet')
if csv_file:
reader = csv.DictReader(
lowercase_first_line(
normalize_header(
StringIO(csv_file.read().decode('utf-8'))
),
)
Expand Down Expand Up @@ -284,7 +284,7 @@ def save_model(self, request, obj, form, change):
# Get spreadsheet with metadata to match each volume
obj.metadata_spreadsheet.seek(0)
metadata = csv.DictReader(
lowercase_first_line(
normalize_header(
StringIO(obj.metadata_spreadsheet.read().decode('utf-8'))
),
)
Expand Down
27 changes: 18 additions & 9 deletions apps/ingest/services.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
""" Module of service classes and methods for ingest. """
import itertools
import re
from mimetypes import guess_type
from urllib.parse import unquote, urlparse

Expand All @@ -10,15 +11,21 @@


def clean_metadata(metadata):
print(metadata)
"""Remove keys that do not align with Manifest fields.
"""Normalize names of fields that align with Manifest fields.
:param metadata:
:type metadata: tablib.Dataset
:return: Dictionary with keys matching Manifest fields
:rtype: dict
"""
metadata = {key.casefold().replace(' ', '_'): value for key, value in metadata.items()}
fields = [f.name for f in Manifest._meta.get_fields()]
metadata = {
(
key.casefold().replace(" ", "_")
if key.casefold().replace(" ", "_") in fields
else key
): value for key, value in metadata.items()
}

for key in metadata.keys():
if key != 'metadata' and isinstance(metadata[key], list):
Expand Down Expand Up @@ -249,6 +256,7 @@ def get_associated_meta(all_metadata, file):
file_meta = {}
extless_filename = file.name[0:file.name.rindex('.')]
for meta_dict in all_metadata:
metadata_found_filename = None
for key, val in meta_dict.items():
if key.casefold() == 'filename':
metadata_found_filename = val
Expand All @@ -257,9 +265,10 @@ def get_associated_meta(all_metadata, file):
file_meta = meta_dict
return file_meta

def lowercase_first_line(iterator):
    """Lowercase the first line of a text file (such as the header row of a CSV).

    Non-ASCII characters are dropped from the first line, which is then
    casefolded and stripped of surrounding whitespace. Every later line is
    passed through untouched.

    :param iterator: Iterable of text lines (e.g. a StringIO of the file)
    :type iterator: iterable of str
    :return: Iterator yielding the normalized first line followed by the
        remaining lines; empty when the input has no lines
    :rtype: iterator
    """
    # Accept any iterable (list, file object, StringIO), not only iterators.
    lines = iter(iterator)
    try:
        first_line = next(lines)
    except StopIteration:
        # An empty file has no first line; hand back an empty iterator
        # instead of letting StopIteration escape to the caller.
        return iter(())
    # ignore non-ASCII characters, set lowercase, and strip whitespace
    header = first_line.encode('ascii', 'ignore').decode().casefold().strip()
    return itertools.chain([header], lines)
def normalize_header(iterator):
    """Normalize the header row of a metadata CSV.

    Non-ASCII characters are dropped from the first line and surrounding
    whitespace is stripped. Every case-insensitive occurrence of "pid" in
    that line is lowercased so the column can be accessed with a fixed key;
    all other header text keeps its original casing. Every later line is
    passed through untouched.

    :param iterator: Iterable of text lines (e.g. a StringIO of the CSV)
    :type iterator: iterable of str
    :return: Iterator yielding the normalized header row followed by the
        remaining lines; empty when the input has no lines
    :rtype: iterator
    """
    # Accept any iterable (list, file object, StringIO), not only iterators.
    lines = iter(iterator)
    try:
        first_line = next(lines)
    except StopIteration:
        # An empty file has no header row; hand back an empty iterator
        # instead of letting StopIteration escape to the caller.
        return iter(())
    # ignore non-ASCII characters and strip whitespace
    header_row = first_line.encode("ascii", "ignore").decode().strip()
    # lowercase the word "pid" in this row so we can access it easily; the
    # row is pure ASCII at this point, so IGNORECASE matches exactly the
    # upper/lower variants of the three letters
    header_row = re.sub(r"pid", "pid", header_row, flags=re.IGNORECASE)
    return itertools.chain([header_row], lines)
2 changes: 1 addition & 1 deletion apps/ingest/tests/test_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from apps.iiif.manifests.tests.factories import ManifestFactory
from .factories import RemoteFactory
from ..services import (clean_metadata, create_manifest, get_associated_meta,
get_metadata_from, lowercase_first_line)
get_metadata_from, normalize_header)


class RemoteTest(TestCase):
Expand Down
2 changes: 1 addition & 1 deletion apps/ingest/tests/test_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from apps.iiif.manifests.tests.factories import ManifestFactory, ImageServerFactory
from ..models import S3Ingest
from ..services import (clean_metadata, create_manifest, get_associated_meta,
get_metadata_from, lowercase_first_line)
get_metadata_from, normalize_header)

pytestmark = pytest.mark.django_db(transaction=True) # pylint: disable = invalid-name

Expand Down
4 changes: 1 addition & 3 deletions apps/ingest/tests/test_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ def setUp(self):
self.fixture_path = os.path.join(settings.APPS_DIR, 'ingest/fixtures/')

def test_cleaning_metadata(self):
""" It should normalize keys and remove key/value pairs that
do not match a Manifest field. """
""" It should normalize keys that match a Manifest field. """
fake_metadata = {
'pid': 'blm',
'invalid': 'trump',
Expand All @@ -40,7 +39,6 @@ def test_cleaning_metadata(self):

assert 'Published City' not in cleaned_metadata.keys()
assert 'PUBLISHER' not in cleaned_metadata.keys()
assert 'invalid' not in cleaned_metadata.keys()
assert cleaned_metadata['published_city'] == fake_metadata['Published City']
assert cleaned_metadata['publisher'] == fake_metadata['PUBLISHER']

Expand Down

0 comments on commit 466d61a

Please sign in to comment.