diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c1b0bf37..62e2d671 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -7,7 +7,7 @@ jobs: - uses: actions/checkout@v2 - uses: actions/setup-python@v2 with: - python-version: '3.7' + python-version: "3.8" - name: Install requirements run: pip install flake8 pycodestyle - name: Check syntax @@ -35,7 +35,7 @@ jobs: POSTGRES_DB: postgres options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 redis: - image: redis:3 + image: redis:3 env: CKAN_SQLALCHEMY_URL: postgresql://ckan_default:pass@postgres/ckan_test CKAN_DATASTORE_WRITE_URL: postgresql://datastore_write:pass@postgres/datastore_test @@ -44,23 +44,23 @@ jobs: CKAN_REDIS_URL: redis://redis:6379/1 steps: - - uses: actions/checkout@v2 - - name: Install requirements - run: | - pip install -r dev-requirements.txt - pip install -r requirements.txt - pip install --no-warn-conflicts jinja2==2.10.1 - pip install --no-warn-conflicts markupsafe==2.0.1 - pip install -e . - # Replace default path to CKAN core config file with the one on the container - sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini - - name: Setup extension - run: | - ckan -c test.ini db init - - name: Run tests - run: pytest --ckan-ini=test.ini --cov=ckanext.validation --cov-report=xml --cov-append --disable-warnings ckanext/validation/tests -vv + - uses: actions/checkout@v2 + - name: Install requirements + run: | + pip install -r dev-requirements.txt + pip install -r requirements.txt + pip install --no-warn-conflicts jinja2==2.10.1 + pip install --no-warn-conflicts markupsafe==2.0.1 + pip install -e . + # Replace default path to CKAN core config file with the one on the container + sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini + - name: Setup extension + run: | + ckan -c test.ini db init + - name: Run tests + run: pytest --ckan-ini=test.ini --cov=ckanext.validation --cov-report=xml --cov-append --disable-warnings ckanext/validation/tests -vv - - name: Upload coverage report to codecov - uses: codecov/codecov-action@v1 - with: - file: ./coverage.xml + - name: Upload coverage report to codecov + uses: codecov/codecov-action@v1 + with: + file: ./coverage.xml diff --git a/README.md b/README.md index 921b2702..b8bb3f08 100644 --- a/README.md +++ b/README.md @@ -262,6 +262,30 @@ With the extension enabled and configured, schemas can be attached to the `schema` field on resources via the UI form or the API. If present in a resource, they will be used when performing validation on the resource file. +#### Foreign Keys + +As per the Frictionless Framework, ckanext-validation can also be used to +validate foreign keys. This is done by adding a `foreignKeys` property to the +schema, with the following format: + +```json +{ + "foreignKeys": [ + { + "fields": "location", + "reference": { + "resource": "locations", + "fields": "code" + } + } + ] +} +``` + +This will validate that the values in the `location` column are present in the +`code` column of the `locations` resource. The `resource` property can be +either the full url of a resource, a valid Frictionless Schema in JSON or the +`resource_type` that matches another resource within the CKAN dataset. ### Validation Options diff --git a/ckanext/validation/jobs.py b/ckanext/validation/jobs.py index 23999b4c..a90b287f 100644 --- a/ckanext/validation/jobs.py +++ b/ckanext/validation/jobs.py @@ -7,7 +7,7 @@ import requests from sqlalchemy.orm.exc import NoResultFound -from frictionless import validate, system, Report, Schema, Dialect, Check +from frictionless import system, Resource, Package, Report, Schema, Dialect, Check, Checklist, Detector from ckan.model import Session import ckan.lib.uploader as uploader @@ -64,7 +64,7 @@ def run_validation_job(resource): # implementation) pass_auth_header = t.asbool( t.config.get('ckanext.validation.pass_auth_header', True)) - if dataset['private'] and pass_auth_header: + if pass_auth_header: s = requests.Session() s.headers.update({ 'Authorization': t.config.get( @@ -86,7 +86,13 @@ def run_validation_job(resource): schema = json.loads(schema) _format = resource['format'].lower() - report = _validate_table(source, _format=_format, schema=schema, **options) + + if schema and 'foreignKeys' in schema: + reference_resources = _prepare_foreign_keys(dataset, schema) + else: + reference_resources=[] + + report = _validate_table(source, reference_resources=reference_resources, _format=_format, schema=schema, **options) # Hide uploaded files if type(report) == Report: @@ -136,7 +142,7 @@ def run_validation_job(resource): -def _validate_table(source, _format='csv', schema=None, **options): +def _validate_table(source, _format='csv', schema=None, reference_resources=[], **options): # This option is needed to allow Frictionless Framework to validate absolute paths frictionless_context = { 'trusted': True } @@ -156,18 +162,92 @@ def _validate_table(source, _format='csv', schema=None, **options): dialect = Dialect.from_descriptor(options['dialect']) options['dialect'] = dialect - # Load the list of checks and its parameters declaratively as in https://framework.frictionlessdata.io/docs/checks/table.html + # Load the list of checks and parameters declaratively as in https://framework.frictionlessdata.io/docs/checks/table.html if 'checks' in options: - checklist = [Check.from_descriptor(c) for c in options['checks']] - options['checks'] = checklist + checklist = Checklist(checks = [Check.from_descriptor(c) for c in options.pop('checks')]) + else: + # Note that it's very important to initialise Checklist with NOTHING and not None if there are no checks declared + checklist = Checklist() + if 'pick_errors' in options: + checklist.pick_errors = options.pop('pick_errors', None) + if 'skip_errors' in options: + checklist.skip_errors = options.pop('skip_errors', None) + + # remove limit_errors and limit_rows + limit_errors = options.pop('limit_errors', None) + limit_rows = options.pop('limit_rows', None) with system.use_context(**frictionless_context): - report = validate(source, format=_format, schema=resource_schema, **options) - log.debug('Validating source: %s', source) + # load source as frictionless Resource + if resource_schema: + # with schema + resource = Resource(path=source, format=_format, schema=resource_schema, **options) + else: + # without schema + resource = Resource(path=source, format=_format, **options) + + # add resource to a frictionless Package + package = Package(resources=[resource]) + + # if foreign keys are defined, we need to add the referenced resource(s) to the package + for reference in reference_resources: + referenced_resource = Resource(**reference) + package.add_resource(referenced_resource) + + # report = validate(package, pick_errors=pick_errors, skip_errors=skip_errors, limit_errors=limit_errors) + report = package.validate(checklist=checklist, limit_errors=limit_errors, limit_rows=limit_rows) return report +def _load_if_json(value): + try: + json_object = json.loads(value) + except ValueError as e: + return None + return json_object + +def _prepare_foreign_keys(dataset, schema): + referenced_resources = [] + + for foreign_key in schema.get('foreignKeys', {}): + log.debug(f'Prepping Foreign Key resources: {foreign_key}') + + if foreign_key['reference']['resource'] == '': + continue + + foreign_key_resource = None + foreign_key_format = 'json' + if foreign_key['reference']['resource'].startswith('http'): + log.debug(f"Foreign Key resource is at url: {foreign_key['reference']['resource']}") + + foreign_key_resource = foreign_key['reference']['resource'] + if json_object := _load_if_json(foreign_key['reference']['resource']): + log.debug(f'Foreign Key resource is a json object with keys: {json_object.keys()}') + + foreign_key_resource = json_object + else: + log.debug('Foreign Key resource is (presumably) a resource in this dataset.') + + # get the available resources in this dataset + dataset_resources = [{r.get('resource_type'): {'url':r.get('url'), 'format': r.get('format')}} for r in dataset['resources']] + dataset_resources = {k:v for list_item in dataset_resources for (k,v) in list_item.items()} + + # check foreign key resource is in the dataset and get the url + # if it turns out it isn't we will raise an exception + if foreign_key['reference']['resource'] in dataset_resources.keys(): + foreign_key_resource = dataset_resources[foreign_key['reference']['resource']]['url'] + foreign_key_format = dataset_resources[foreign_key['reference']['resource']]['format'].lower() + else: + raise t.ValidationError( + {'foreignKey': 'Foreign key reference does not exist.' + + 'Must be a url, json object or a resource in this dataset.'}) + + referenced_resources.append({'name': foreign_key['reference']['resource'], 'path': foreign_key_resource, 'format': foreign_key_format}) + + log.debug('Foreign key resources required: ' + str(referenced_resources)) + return referenced_resources + def _get_site_user_api_key(): site_user_name = t.get_action('get_site_user')({'ignore_auth': True}, {}) diff --git a/ckanext/validation/tests/test_interfaces.py b/ckanext/validation/tests/test_interfaces.py index cadca410..a621d0fb 100644 --- a/ckanext/validation/tests/test_interfaces.py +++ b/ckanext/validation/tests/test_interfaces.py @@ -55,7 +55,7 @@ class TestInterfaceSync(): @pytest.mark.ckan_config('ckanext.validation.run_on_create_async', False) @pytest.mark.ckan_config('ckanext.validation.run_on_update_async', False) @pytest.mark.ckan_config('ckanext.validation.run_on_create_sync', True) - @mock.patch('ckanext.validation.jobs.validate', + @mock.patch('frictionless.Package.validate', return_value=VALID_REPORT) def test_can_validate_called_on_create_sync(self, mock_validation): @@ -73,7 +73,7 @@ def test_can_validate_called_on_create_sync(self, mock_validation): @pytest.mark.ckan_config('ckanext.validation.run_on_create_async', False) @pytest.mark.ckan_config('ckanext.validation.run_on_update_async', False) @pytest.mark.ckan_config('ckanext.validation.run_on_create_sync', True) - @mock.patch('ckanext.validation.jobs.validate') + @mock.patch('frictionless.Package.validate') def test_can_validate_called_on_create_sync_no_validation(self, mock_validation): dataset = factories.Dataset() @@ -91,7 +91,7 @@ def test_can_validate_called_on_create_sync_no_validation(self, mock_validation) @pytest.mark.ckan_config('ckanext.validation.run_on_create_async', False) @pytest.mark.ckan_config('ckanext.validation.run_on_update_async', False) @pytest.mark.ckan_config('ckanext.validation.run_on_update_sync', True) - @mock.patch('ckanext.validation.jobs.validate', + @mock.patch('frictionless.Package.validate', return_value=VALID_REPORT) def test_can_validate_called_on_update_sync(self, mock_validation): @@ -113,7 +113,7 @@ def test_can_validate_called_on_update_sync(self, mock_validation): @pytest.mark.ckan_config('ckanext.validation.run_on_create_async', False) @pytest.mark.ckan_config('ckanext.validation.run_on_update_async', False) @pytest.mark.ckan_config('ckanext.validation.run_on_update_sync', True) - @mock.patch('ckanext.validation.jobs.validate') + @mock.patch('frictionless.Package.validate') def test_can_validate_called_on_update_sync_no_validation(self, mock_validation): dataset = factories.Dataset() diff --git a/ckanext/validation/tests/test_jobs.py b/ckanext/validation/tests/test_jobs.py index 8e770eb0..9d0eba60 100644 --- a/ckanext/validation/tests/test_jobs.py +++ b/ckanext/validation/tests/test_jobs.py @@ -34,10 +34,11 @@ def mock_get_resource_uploader(data_dict): class TestValidationJob(object): @pytest.mark.ckan_config("ckanext.validation.run_on_create_async", False) - @mock.patch("ckanext.validation.jobs.validate", return_value=VALID_REPORT) + @mock.patch("frictionless.Package.validate", return_value=VALID_REPORT) + @mock.patch("frictionless.Resource.__create__") @mock.patch.object(Session, "commit") @mock.patch.object(ckantoolkit, "get_action") - def test_job_run_no_schema(self, mock_get_action, mock_commit, mock_validate): + def test_job_run_no_schema(self, mock_get_action, mock_commit, mock_frictionless_resource, mock_validate): org = factories.Organization() dataset = factories.Dataset(private=True, owner_org=org["id"]) @@ -51,14 +52,21 @@ def test_job_run_no_schema(self, mock_get_action, mock_commit, mock_validate): run_validation_job(resource) - assert mock_validate.call_args[0][0] == "http://example.com/file.csv" - assert mock_validate.call_args[1]["format"] == "csv" - assert mock_validate.call_args[1]["schema"] is None - - @mock.patch("ckanext.validation.jobs.validate", return_value=VALID_REPORT) + assert mock_frictionless_resource.call_count == 1 + assert mock_frictionless_resource.call_args[1]["path"] == "http://example.com/file.csv" + assert mock_frictionless_resource.call_args[1]["format"] == "csv" + assert "schema" not in mock_frictionless_resource.call_args[1] + + assert mock_validate.call_count == 1 + # assert mock_validate.call_args[1]["checklist"] is {} + assert mock_validate.call_args[1]["limit_errors"] is None + assert mock_validate.call_args[1]["limit_rows"] is None + + @mock.patch("frictionless.Package.validate", return_value=VALID_REPORT) + @mock.patch("frictionless.Resource.__create__") @mock.patch.object(Session, "commit") @mock.patch.object(ckantoolkit, "get_action") - def test_job_run_schema(self, mock_get_action, mock_commit, mock_validate): + def test_job_run_schema(self, mock_get_action, mock_commit, mock_frictionless_resource, mock_validate): org = factories.Organization() dataset = factories.Dataset(private=True, owner_org=org["id"]) @@ -79,18 +87,25 @@ def test_job_run_schema(self, mock_get_action, mock_commit, mock_validate): run_validation_job(resource) - assert mock_validate.call_args[0][0] == "http://example.com/file.csv" - assert mock_validate.call_args[1]["format"] == "csv" - assert mock_validate.call_args[1]["schema"].to_dict() == schema - - @mock.patch("ckanext.validation.jobs.validate", return_value=VALID_REPORT) + assert mock_frictionless_resource.call_count == 1 + assert mock_frictionless_resource.call_args[1]["path"] == "http://example.com/file.csv" + assert mock_frictionless_resource.call_args[1]["format"] == "csv" + assert mock_frictionless_resource.call_args[1]["schema"].to_dict() == schema + + assert mock_validate.call_count == 1 + # assert mock_validate.call_args[1]["checklist"] is {} + assert mock_validate.call_args[1]["limit_errors"] is None + assert mock_validate.call_args[1]["limit_rows"] is None + + @mock.patch("frictionless.Package.validate", return_value=VALID_REPORT) + @mock.patch("frictionless.Resource.__create__") @mock.patch.object( uploader, "get_resource_uploader", return_value=mock_get_resource_uploader({}) ) @mock.patch.object(Session, "commit") @mock.patch.object(ckantoolkit, "get_action") def test_job_run_uploaded_file( - self, mock_get_action, mock_commit, mock_uploader, mock_validate + self, mock_get_action, mock_commit, mock_uploader, mock_frictionless_resource, mock_validate ): org = factories.Organization() @@ -106,11 +121,17 @@ def test_job_run_uploaded_file( run_validation_job(resource) - assert mock_validate.call_args[0][0] == "/tmp/example/{}".format(resource["id"]) - assert mock_validate.call_args[1]["format"] == "csv" - assert mock_validate.call_args[1]["schema"] is None - - @mock.patch("ckanext.validation.jobs.validate", return_value=VALID_REPORT) + assert mock_frictionless_resource.call_count == 1 + assert mock_frictionless_resource.call_args[1]["path"] == "/tmp/example/{}".format(resource["id"]) + assert mock_frictionless_resource.call_args[1]["format"] == "csv" + assert "schema" not in mock_frictionless_resource.call_args[1] + + assert mock_validate.call_count == 1 + # assert mock_validate.call_args[1]["checklist"] is {} + assert mock_validate.call_args[1]["limit_errors"] is None + assert mock_validate.call_args[1]["limit_rows"] is None + + @mock.patch("frictionless.Package.validate", return_value=VALID_REPORT) def test_job_run_valid_stores_validation_object(self, mock_validate): resource = factories.Resource(url="http://example.com/file.csv", format="csv") @@ -127,7 +148,7 @@ def test_job_run_valid_stores_validation_object(self, mock_validate): assert json.loads(validation.report) == VALID_REPORT assert validation.finished - @mock.patch("ckanext.validation.jobs.validate", return_value=INVALID_REPORT) + @mock.patch("frictionless.Package.validate", return_value=INVALID_REPORT) def test_job_run_invalid_stores_validation_object(self, mock_validate): resource = factories.Resource(url="http://example.com/file.csv", format="csv") @@ -144,7 +165,7 @@ def test_job_run_invalid_stores_validation_object(self, mock_validate): assert json.loads(validation.report) == INVALID_REPORT assert validation.finished - @mock.patch("ckanext.validation.jobs.validate", return_value=ERROR_REPORT) + @mock.patch("frictionless.Package.validate", return_value=ERROR_REPORT) def test_job_run_error_stores_validation_object(self, mock_validate): resource = factories.Resource(url="http://example.com/file.csv", format="csv") @@ -162,7 +183,7 @@ def test_job_run_error_stores_validation_object(self, mock_validate): assert validation.finished @mock.patch( - "ckanext.validation.jobs.validate", return_value=VALID_REPORT_LOCAL_FILE + "frictionless.Package.validate", return_value=VALID_REPORT_LOCAL_FILE ) @mock.patch.object( uploader, "get_resource_uploader", return_value=mock_get_resource_uploader({}) @@ -182,7 +203,7 @@ def test_job_run_uploaded_file_replaces_paths(self, mock_uploader, mock_validate report = json.loads(validation.report) assert report["tasks"][0]["place"].startswith("http") - @mock.patch("ckanext.validation.jobs.validate", return_value=VALID_REPORT) + @mock.patch("frictionless.Package.validate", return_value=VALID_REPORT) def test_job_run_valid_stores_status_in_resource(self, mock_validate): resource = factories.Resource(url="http://example.com/file.csv", format="csv") diff --git a/ckanext/validation/tests/test_logic.py b/ckanext/validation/tests/test_logic.py index 90046f29..875debed 100644 --- a/ckanext/validation/tests/test_logic.py +++ b/ckanext/validation/tests/test_logic.py @@ -347,8 +347,9 @@ def test_run_non_auth_user(self): user = factories.User() org = factories.Organization() dataset = factories.Dataset( - owner_org=org["id"], resources=[factories.Resource()] + owner_org=org["id"] ) + resource = factories.Resource(package_id=dataset['id']) context = {"user": user["name"], "model": model} @@ -357,7 +358,7 @@ def test_run_non_auth_user(self): call_auth, "resource_validation_run", context=context, - resource_id=dataset["resources"][0]["id"], + resource_id=resource["id"], ) def test_run_auth_user(self): @@ -367,8 +368,9 @@ def test_run_auth_user(self): users=[{"name": user["name"], "capacity": "editor"}] ) dataset = factories.Dataset( - owner_org=org["id"], resources=[factories.Resource()] + owner_org=org["id"] ) + resource = factories.Resource(package_id=dataset['id']) context = {"user": user["name"], "model": model} @@ -376,7 +378,7 @@ def test_run_auth_user(self): call_auth( "resource_validation_run", context=context, - resource_id=dataset["resources"][0]["id"], + resource_id=resource["id"], ) is True ) @@ -416,8 +418,9 @@ def test_delete_non_auth_user(self): user = factories.User() org = factories.Organization() dataset = factories.Dataset( - owner_org=org["id"], resources=[factories.Resource()] + owner_org=org["id"] ) + resource = factories.Resource(package_id=dataset['id']) context = {"user": user["name"], "model": model} @@ -426,7 +429,7 @@ def test_delete_non_auth_user(self): call_auth, "resource_validation_delete", context=context, - resource_id=dataset["resources"][0]["id"], + resource_id=resource["id"], ) def test_delete_auth_user(self): @@ -436,8 +439,9 @@ def test_delete_auth_user(self): users=[{"name": user["name"], "capacity": "editor"}] ) dataset = factories.Dataset( - owner_org=org["id"], resources=[factories.Resource()] + owner_org=org["id"] ) + resource = factories.Resource(package_id=dataset['id']) context = {"user": user["name"], "model": model} @@ -445,7 +449,7 @@ def test_delete_auth_user(self): call_auth( "resource_validation_delete", context=context, - resource_id=dataset["resources"][0]["id"], + resource_id=resource["id"], ) is True ) @@ -468,8 +472,9 @@ def test_show_anon_public_dataset(self): user = factories.User() org = factories.Organization() dataset = factories.Dataset( - owner_org=org["id"], resources=[factories.Resource()], private=False + owner_org=org["id"], private=False ) + resource = factories.Resource(package_id=dataset['id']) context = {"user": user["name"], "model": model} @@ -477,7 +482,7 @@ def test_show_anon_public_dataset(self): call_auth( "resource_validation_show", context=context, - resource_id=dataset["resources"][0]["id"], + resource_id=resource["id"], ) is True ) @@ -487,8 +492,9 @@ def test_show_anon_private_dataset(self): user = factories.User() org = factories.Organization() dataset = factories.Dataset( - owner_org=org["id"], resources=[factories.Resource()], private=True + owner_org=org["id"] ) + resource = factories.Resource(package_id=dataset['id']) context = {"user": user["name"], "model": model} @@ -497,7 +503,7 @@ def test_show_anon_private_dataset(self): call_auth, "resource_validation_run", context=context, - resource_id=dataset["resources"][0]["id"], + resource_id=resource["id"], ) @@ -575,7 +581,7 @@ def test_validation_passes_on_upload(self): assert resource["validation_status"] == "success" assert "validation_timestamp" in resource - @mock.patch("ckanext.validation.jobs.validate", return_value=VALID_REPORT) + @mock.patch("frictionless.Package.validate", return_value=VALID_REPORT) def test_validation_passes_with_url(self, mock_validate): url = "https://example.com/valid.csv" @@ -672,7 +678,7 @@ def test_validation_passes_on_upload(self): assert resource["validation_status"] == "success" assert "validation_timestamp" in resource - @mock.patch("ckanext.validation.jobs.validate", return_value=VALID_REPORT) + @mock.patch("frictionless.Package.validate", return_value=VALID_REPORT) def test_validation_passes_with_url(self, mock_validate): dataset = factories.Dataset(resources=[{"url": "https://example.com/data.csv"}]) diff --git a/requirements.txt b/requirements.txt index ee7cb38c..2e245e01 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ ckantoolkit>=0.0.3 -frictionless==5.0.0b9 +frictionless[ckan]==5.0.0b9 markupsafe==2.0.1 tableschema -e git+https://github.com/ckan/ckanext-scheming.git#egg=ckanext-scheming diff --git a/test.ini b/test.ini index baf45473..89b19822 100644 --- a/test.ini +++ b/test.ini @@ -9,7 +9,7 @@ host = 0.0.0.0 port = 5000 [app:main] -use = config:../../src/ckan/test-core.ini +use = config:../ckan/test-core.ini # Insert any custom config settings to be used when running your extension's # tests here.