From e1b5f324b11f2b7a94068c5d64839164aa9720ac Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 27 May 2024 14:37:23 +0200 Subject: [PATCH] [#56] Add most DCAT AP 1.1 standard and list fields --- ckanext/dcat/schemas/dcat_ap_2.1.yaml | 139 ++++++++++++++----- ckanext/dcat/tests/test_scheming_support.py | 144 +++++++++++++++++++- ckanext/dcat/tests/utils.py | 3 + 3 files changed, 246 insertions(+), 40 deletions(-) diff --git a/ckanext/dcat/schemas/dcat_ap_2.1.yaml b/ckanext/dcat/schemas/dcat_ap_2.1.yaml index 20edc599..a3ddf67a 100644 --- a/ckanext/dcat/schemas/dcat_ap_2.1.yaml +++ b/ckanext/dcat/schemas/dcat_ap_2.1.yaml @@ -23,8 +23,23 @@ dataset_fields: form_snippet: markdown.html form_placeholder: eg. Some useful notes about the data +- field_name: contact + label: Contact points + repeating_label: Contact point + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + display_snippet: email.html + - field_name: tag_string - label: Tags + label: Keywords preset: tag_string_autocomplete form_placeholder: eg. 
economy, mental health, government @@ -48,55 +63,70 @@ dataset_fields: validators: ignore_missing unicode_safe package_version_validator form_placeholder: '1.0' -- field_name: author - label: Author - form_placeholder: Joe Bloggs - display_property: dc:creator - -- field_name: author_email - label: Author Email - form_placeholder: joe@example.com - display_property: dc:creator - display_snippet: email.html - display_email_name_field: author - -- field_name: maintainer - label: Maintainer - form_placeholder: Joe Bloggs - display_property: dc:contributor - -- field_name: maintainer_email - label: Maintainer Email - form_placeholder: joe@example.com - display_property: dc:contributor - display_snippet: email.html - display_email_name_field: maintainer +# Note: this will fall back to metadata_created if not present +- field_name: issued + label: Release date + # TODO: dcat_date preset -- field_name: contact - label: Contact points - repeating_label: Contact point - repeating_subfields: +# Note: this will fall back to metadata_modified if not present +- field_name: modified + label: Modification date + # TODO: dcat_date preset - - field_name: uri - label: URI +- field_name: identifier + label: Identifier - - field_name: name - label: Name +- field_name: frequency + label: Frequency - - field_name: email - label: Email +- field_name: provenance + label: Provenance + +- field_name: dcat_type + label: Type + # TODO: controlled vocabulary? 
+ +- field_name: access_rights + label: Access rights + validators: ignore_missing unicode_safe + form_snippet: markdown.html - field_name: version_notes label: Version notes validators: ignore_missing unicode_safe form_snippet: markdown.html +- field_name: alternate_identifier + label: Alternate identifier + preset: multiple_text + validators: ignore_missing scheming_multiple_text + +- field_name: theme + label: Theme + preset: multiple_text + validators: ignore_missing scheming_multiple_text + +- field_name: language + label: Language + preset: multiple_text + validators: ignore_missing scheming_multiple_text + # TODO: language form snippet / validator / graph + +- field_name: documentation + label: Documentation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + - field_name: conforms_to label: Conforms to preset: multiple_text validators: ignore_missing scheming_multiple_text +# Note: if not provided, this will be autogenerated +- field_name: uri + label: URI +# TODO: relation-based properties are not yet included (e.g. 
is_version_of, source, sample, etc) resource_fields: - field_name: url @@ -116,15 +146,51 @@ resource_fields: label: Format preset: resource_format_autocomplete +- field_name: size + label: Size + # TODO: number validator / snippet + - field_name: rights label: Rights form_snippet: markdown.html form_placeholder: Some statement about the rights associated with the resource +- field_name: status + label: Status + +- field_name: license + label: License + +# Note: this falls back to the standard resource url field +- field_name: access_url + label: Access URL + +# Note: this falls back to the standard resource url field +- field_name: download_url + label: Download URL + +- field_name: issued + label: Release date + # TODO: dcat_date preset + +- field_name: modified + label: Modification date + # TODO: dcat_date preset + - field_name: language label: Language preset: multiple_text +- field_name: documentation + label: Documentation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + +- field_name: conforms_to + label: Conforms to + preset: multiple_text + validators: ignore_missing scheming_multiple_text + - field_name: access_services label: Access services repeating_label: Access service @@ -139,3 +205,8 @@ resource_fields: - field_name: endpoint_url label: Endpoint URL preset: multiple_text + +# Note: if not provided, this will be autogenerated +- field_name: uri + label: URI + diff --git a/ckanext/dcat/tests/test_scheming_support.py b/ckanext/dcat/tests/test_scheming_support.py index 88d73e3f..3d12c506 100644 --- a/ckanext/dcat/tests/test_scheming_support.py +++ b/ckanext/dcat/tests/test_scheming_support.py @@ -1,6 +1,7 @@ import pytest from rdflib.namespace import RDF +from rdflib.term import URIRef from ckan.tests.helpers import call_action @@ -50,8 +51,23 @@ def test_e2e_ckan_to_dcat(self): "version": "1.0b", "tags": [{"name": "Tag 1"}, {"name": "Tag 2"}], # Standard fields + "issued": "2024-05-01", + "modified": "2024-05-05", + 
"identifier": "xx-some-dataset-id-yy", + "frequency": "monthly", + "provenance": "Statement about provenance", + "dcat_type": "test-type", "version_notes": "Some version notes", + "access_rights": "Statement about access rights", # List fields (lists) + "alternate_identifier": ["alt-id-1", "alt-id-2"], + "theme": [ + "https://example.org/uri/theme1", + "https://example.org/uri/theme2", + "https://example.org/uri/theme3", + ], + "language": ["en", "ca", "es"], + "documentation": ["https://example.org/some-doc.html"], "conforms_to": ["Standard 1", "Standard 2"], # Repeating subfields "contact": [ @@ -63,6 +79,12 @@ def test_e2e_ckan_to_dcat(self): "name": "Resource 1", "url": "https://example.com/data.csv", "format": "CSV", + "status": "published", + "access_url": "https://example.com/data.csv", + "download_url": "https://example.com/data.csv", + "issued": "2024-05-01T01:20:33", + "modified": "2024-05-05T09:33:20", + "license": "http://creativecommons.org/licenses/by/3.0/", "rights": "Some stament about rights", "language": ["en", "ca", "es"], "access_services": [ @@ -95,16 +117,53 @@ def test_e2e_ckan_to_dcat(self): assert self._triple(g, dataset_ref, RDF.type, DCAT.Dataset) assert self._triple(g, dataset_ref, DCT.title, dataset["title"]) assert self._triple(g, dataset_ref, DCT.description, dataset["notes"]) + assert self._triple(g, dataset_ref, OWL.versionInfo, dataset["version"]) # Standard fields + assert self._triple(g, dataset_ref, DCT.identifier, dataset["identifier"]) + assert self._triple( + g, dataset_ref, DCT.accrualPeriodicity, dataset["frequency"] + ) + assert self._triple(g, dataset_ref, DCT.provenance, dataset["provenance"]) + assert self._triple(g, dataset_ref, DCT.type, dataset["dcat_type"]) assert self._triple(g, dataset_ref, ADMS.versionNotes, dataset["version_notes"]) + assert self._triple(g, dataset_ref, DCT.accessRights, dataset["access_rights"]) + + # Dates + assert self._triple( + g, + dataset_ref, + DCT.issued, + dataset["issued"] + 
"T00:00:00", + data_type=XSD.dateTime, + ) + assert self._triple( + g, + dataset_ref, + DCT.modified, + dataset["modified"] + "T00:00:00", + data_type=XSD.dateTime, + ) # List fields - # TODO helper function - conforms_to = [ - str(t[2]) for t in g.triples((dataset_ref, DCT.conformsTo, None)) - ] - assert conforms_to == dataset["conforms_to"] + + assert ( + self._triples_list_values(g, dataset_ref, DCT.conformsTo) + == dataset["conforms_to"] + ) + assert ( + self._triples_list_values(g, dataset_ref, ADMS.identifier) + == dataset["alternate_identifier"] + ) + assert self._triples_list_values(g, dataset_ref, DCAT.theme) == dataset["theme"] + assert ( + self._triples_list_values(g, dataset_ref, DCT.language) + == dataset["language"] + ) + assert ( + self._triples_list_values(g, dataset_ref, FOAF.page) + == dataset["documentation"] + ) # Repeating subfields @@ -137,6 +196,37 @@ def test_e2e_ckan_to_dcat(self): assert self._triple( g, distribution_ref, DCT.rights, dataset_dict["resources"][0]["rights"] ) + assert self._triple( + g, distribution_ref, DCT.status, dataset_dict["resources"][0]["status"] + ) + assert self._triple( + g, + distribution_ref, + DCAT.accessURL, + dataset_dict["resources"][0]["access_url"], + ) + assert self._triple( + g, + distribution_ref, + DCAT.downloadURL, + dataset_dict["resources"][0]["download_url"], + ) + + # Resources: dates + assert self._triple( + g, + distribution_ref, + DCT.issued, + dataset["resources"][0]["issued"], + data_type=XSD.dateTime, + ) + assert self._triple( + g, + distribution_ref, + DCT.modified, + dataset["resources"][0]["modified"], + data_type=XSD.dateTime, + ) # Resources: list fields @@ -216,9 +306,31 @@ def test_e2e_dcat_to_ckan(self): # Standard fields assert dataset["version_notes"] == "New schema added" + assert dataset["identifier"] == u"9df8df51-63db-37a8-e044-0003ba9b0d98" + assert dataset["frequency"] == "http://purl.org/cld/freq/daily" + assert dataset["access_rights"] == "public" + assert 
dataset["provenance"] == "Some statement about provenance" + assert dataset["dcat_type"] == "test-type" + + assert dataset["issued"] == u"2012-05-10" + assert dataset["modified"] == u"2012-05-10T21:04:00" # List fields - assert dataset["conforms_to"] == ["Standard 1", "Standard 2"] + assert sorted(dataset["conforms_to"]) == ["Standard 1", "Standard 2"] + assert sorted(dataset["language"]) == ["ca", "en", "es"] + assert sorted(dataset["theme"]) == [ + "Earth Sciences", + "http://eurovoc.europa.eu/100142", + "http://eurovoc.europa.eu/209065", + ] + assert sorted(dataset["alternate_identifier"]) == [ + "alternate-identifier-1", + "alternate-identifier-2", + ] + assert sorted(dataset["documentation"]) == [ + "http://dataset.info.org/doc1", + "http://dataset.info.org/doc2", + ] # Repeating subfields @@ -226,11 +338,31 @@ def test_e2e_dcat_to_ckan(self): assert dataset["contact"][0]["email"] == "contact@some.org" resource = dataset["resources"][0] + + # Resources: core fields + assert resource["url"] == "http://www.bgs.ac.uk/gbase/geochemcd/home.html" + # Resources: standard fields + assert resource["license"] == "http://creativecommons.org/licenses/by-nc/2.0/" assert resource["rights"] == "Some statement about rights" + assert resource["issued"] == "2012-05-11" + assert resource["modified"] == "2012-05-01T00:04:06" + assert resource["status"] == "http://purl.org/adms/status/Completed" + assert resource["size"] == 12323 + + # assert resource['hash'] == u'4304cf2e751e6053c90b1804c89c0ebb758f395a' + # assert resource['hash_algorithm'] == u'http://spdx.org/rdf/terms#checksumAlgorithm_sha1' + + assert resource["access_url"] == "http://www.bgs.ac.uk/gbase/geochemcd/home.html" + assert "download_url" not in resource # Resources: list fields assert sorted(resource["language"]) == ["ca", "en", "es"] + assert sorted(resource["documentation"]) == [ + "http://dataset.info.org/distribution1/doc1", + "http://dataset.info.org/distribution1/doc2", + ] + assert 
sorted(resource["conforms_to"]) == ["Standard 1", "Standard 2"] # Resources: repeating subfields assert resource["access_services"][0]["title"] == "Sparql-end Point" diff --git a/ckanext/dcat/tests/utils.py b/ckanext/dcat/tests/utils.py index c62d9338..8c0e8a18 100644 --- a/ckanext/dcat/tests/utils.py +++ b/ckanext/dcat/tests/utils.py @@ -41,6 +41,9 @@ def _triple(self, graph, subject, predicate, _object, data_type=None): triples = self._triples(graph, subject, predicate, _object, data_type) return triples[0] if triples else None + def _triples_list_values(self, graph, subject, predicate): + return [str(t[2]) for t in graph.triples((subject, predicate, None))] + def _get_typed_list(self, list, datatype): """ returns the list with the given rdf type """ return [datatype(x) for x in list]