diff --git a/apps/ingest/admin.py b/apps/ingest/admin.py index bd9013d5..1536e6fc 100644 --- a/apps/ingest/admin.py +++ b/apps/ingest/admin.py @@ -18,7 +18,7 @@ from .forms import BulkVolumeUploadForm from .models import Bulk, IngestTaskWatcher, Local, Remote, S3Ingest from .services import (clean_metadata, create_manifest, get_associated_meta, - get_metadata_from, lowercase_first_line) + get_metadata_from, normalize_header) LOGGER = logging.getLogger(__name__) class LocalAdmin(admin.ModelAdmin): @@ -253,7 +253,7 @@ def clean(self): csv_file = self.cleaned_data.get('metadata_spreadsheet') if csv_file: reader = csv.DictReader( - lowercase_first_line( + normalize_header( StringIO(csv_file.read().decode('utf-8')) ), ) @@ -284,7 +284,7 @@ def save_model(self, request, obj, form, change): # Get spreadsheet with metadata to match each volume obj.metadata_spreadsheet.seek(0) metadata = csv.DictReader( - lowercase_first_line( + normalize_header( StringIO(obj.metadata_spreadsheet.read().decode('utf-8')) ), ) diff --git a/apps/ingest/services.py b/apps/ingest/services.py index 6d258541..8e1cec7c 100644 --- a/apps/ingest/services.py +++ b/apps/ingest/services.py @@ -1,5 +1,6 @@ """ Module of service classes and methods for ingest. """ import itertools +import re from mimetypes import guess_type from urllib.parse import unquote, urlparse @@ -10,15 +11,21 @@ def clean_metadata(metadata): - print(metadata) - """Remove keys that do not align with Manifest fields. + """Normalize names of fields that align with Manifest fields. :param metadata: :type metadata: tablib.Dataset :return: Dictionary with keys matching Manifest fields :rtype: dict """ - metadata = {key.casefold().replace(' ', '_'): value for key, value in metadata.items()} + fields = [f.name for f in Manifest._meta.get_fields()] + metadata = { + ( + key.casefold().replace(" ", "_") + if key.casefold().replace(" ", "_") in fields + else key + ): value for key, value in metadata.items() + } for key in metadata.keys(): if key != 'metadata' and isinstance(metadata[key], list): @@ -249,6 +256,7 @@ def get_associated_meta(all_metadata, file): file_meta = {} extless_filename = file.name[0:file.name.rindex('.')] for meta_dict in all_metadata: + metadata_found_filename = None for key, val in meta_dict.items(): if key.casefold() == 'filename': metadata_found_filename = val @@ -257,9 +265,10 @@ def get_associated_meta(all_metadata, file): file_meta = meta_dict return file_meta -def lowercase_first_line(iterator): - """Lowercase the first line of a text file (such as the header row of a CSV)""" - return itertools.chain( - # ignore unicode characters, set lowercase, and strip whitespace - [next(iterator).encode('ascii', 'ignore').decode().casefold().strip()], iterator - ) +def normalize_header(iterator): + """Normalize the header row of a metadata CSV""" + # ignore unicode characters and strip whitespace + header_row = next(iterator).encode("ascii", "ignore").decode().strip() + # lowercase the word "pid" in this row so we can access it easily + header_row = re.sub(r"[Pp][Ii][Dd]", lambda m: m.group(0).casefold(), header_row) + return itertools.chain([header_row], iterator) diff --git a/apps/ingest/tests/test_remote.py b/apps/ingest/tests/test_remote.py index 630d8071..c23f7770 100644 --- a/apps/ingest/tests/test_remote.py +++ b/apps/ingest/tests/test_remote.py @@ -6,7 +6,7 @@ from apps.iiif.manifests.tests.factories import ManifestFactory from .factories import RemoteFactory from ..services import (clean_metadata, create_manifest, get_associated_meta, - get_metadata_from, lowercase_first_line) + get_metadata_from, normalize_header) class RemoteTest(TestCase): diff --git a/apps/ingest/tests/test_s3.py b/apps/ingest/tests/test_s3.py index 7f84bb6f..f7e3895c 100644 --- a/apps/ingest/tests/test_s3.py +++ b/apps/ingest/tests/test_s3.py @@ -17,7 +17,7 @@ from apps.iiif.manifests.tests.factories import ManifestFactory, ImageServerFactory from ..models import S3Ingest from ..services import (clean_metadata, create_manifest, get_associated_meta, - get_metadata_from, lowercase_first_line) + get_metadata_from, normalize_header) pytestmark = pytest.mark.django_db(transaction=True) # pylint: disable = invalid-name diff --git a/apps/ingest/tests/test_services.py b/apps/ingest/tests/test_services.py index 1273e77a..cf9fac8d 100644 --- a/apps/ingest/tests/test_services.py +++ b/apps/ingest/tests/test_services.py @@ -21,8 +21,7 @@ def setUp(self): self.fixture_path = os.path.join(settings.APPS_DIR, 'ingest/fixtures/') def test_cleaning_metadata(self): - """ It should normalize keys and remove key/value pairs that - do not match a Manifest field. """ + """ It should normalize keys that match a Manifest field. """ fake_metadata = { 'pid': 'blm', 'invalid': 'trump', @@ -40,7 +39,6 @@ def test_cleaning_metadata(self): assert 'Published City' not in cleaned_metadata.keys() assert 'PUBLISHER' not in cleaned_metadata.keys() - assert 'invalid' not in cleaned_metadata.keys() assert cleaned_metadata['published_city'] == fake_metadata['Published City'] assert cleaned_metadata['publisher'] == fake_metadata['PUBLISHER']