Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support foreign key validation #86

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
42 changes: 21 additions & 21 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ jobs:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: '3.7'
python-version: "3.8"
- name: Install requirements
run: pip install flake8 pycodestyle
- name: Check syntax
Expand Down Expand Up @@ -35,7 +35,7 @@ jobs:
POSTGRES_DB: postgres
options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5
redis:
image: redis:3
image: redis:3
env:
CKAN_SQLALCHEMY_URL: postgresql://ckan_default:pass@postgres/ckan_test
CKAN_DATASTORE_WRITE_URL: postgresql://datastore_write:pass@postgres/datastore_test
Expand All @@ -44,23 +44,23 @@ jobs:
CKAN_REDIS_URL: redis://redis:6379/1

steps:
- uses: actions/checkout@v2
- name: Install requirements
run: |
pip install -r dev-requirements.txt
pip install -r requirements.txt
pip install --no-warn-conflicts jinja2==2.10.1
pip install --no-warn-conflicts markupsafe==2.0.1
pip install -e .
# Replace default path to CKAN core config file with the one on the container
sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini
- name: Setup extension
run: |
ckan -c test.ini db init
- name: Run tests
run: pytest --ckan-ini=test.ini --cov=ckanext.validation --cov-report=xml --cov-append --disable-warnings ckanext/validation/tests -vv
- uses: actions/checkout@v2
- name: Install requirements
run: |
pip install -r dev-requirements.txt
pip install -r requirements.txt
pip install --no-warn-conflicts jinja2==2.10.1
pip install --no-warn-conflicts markupsafe==2.0.1
pip install -e .
# Replace default path to CKAN core config file with the one on the container
sed -i -e 's/use = config:.*/use = config:\/srv\/app\/src\/ckan\/test-core.ini/' test.ini
- name: Setup extension
run: |
ckan -c test.ini db init
- name: Run tests
run: pytest --ckan-ini=test.ini --cov=ckanext.validation --cov-report=xml --cov-append --disable-warnings ckanext/validation/tests -vv

- name: Upload coverage report to codecov
uses: codecov/codecov-action@v1
with:
file: ./coverage.xml
- name: Upload coverage report to codecov
uses: codecov/codecov-action@v1
with:
file: ./coverage.xml
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,30 @@ With the extension enabled and configured, schemas can be attached to the
`schema` field on resources via the UI form or the API. If present in a
resource, they will be used when performing validation on the resource file.

#### Foreign Keys

As per the Frictionless Framework, ckanext-validation can also be used to
validate foreign keys. This is done by adding a `foreignKeys` property to the
schema, with the following format:

```json
{
"foreignKeys": [
{
"fields": "location",
"reference": {
"resource": "locations",
"fields": "code"
}
}
]
}
```

This will validate that the values in the `location` column are present in the
`code` column of the `locations` resource. The `resource` property can be
one of: the full URL of a resource, a valid Frictionless Schema in JSON, or the
`resource_type` that matches another resource within the CKAN dataset.

### Validation Options

Expand Down
98 changes: 89 additions & 9 deletions ckanext/validation/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import requests
from sqlalchemy.orm.exc import NoResultFound
from frictionless import validate, system, Report, Schema, Dialect, Check
from frictionless import system, Resource, Package, Report, Schema, Dialect, Check, Checklist, Detector

from ckan.model import Session
import ckan.lib.uploader as uploader
Expand Down Expand Up @@ -64,7 +64,7 @@ def run_validation_job(resource):
# implementation)
pass_auth_header = t.asbool(
t.config.get('ckanext.validation.pass_auth_header', True))
if dataset['private'] and pass_auth_header:
if pass_auth_header:
s = requests.Session()
s.headers.update({
'Authorization': t.config.get(
Expand All @@ -86,7 +86,13 @@ def run_validation_job(resource):
schema = json.loads(schema)

_format = resource['format'].lower()
report = _validate_table(source, _format=_format, schema=schema, **options)

if schema and 'foreignKeys' in schema:
reference_resources = _prepare_foreign_keys(dataset, schema)
else:
reference_resources=[]

report = _validate_table(source, reference_resources=reference_resources, _format=_format, schema=schema, **options)

# Hide uploaded files
if type(report) == Report:
Expand Down Expand Up @@ -136,7 +142,7 @@ def run_validation_job(resource):



def _validate_table(source, _format='csv', schema=None, **options):
def _validate_table(source, _format='csv', schema=None, reference_resources=[], **options):

# This option is needed to allow Frictionless Framework to validate absolute paths
frictionless_context = { 'trusted': True }
Expand All @@ -156,18 +162,92 @@ def _validate_table(source, _format='csv', schema=None, **options):
dialect = Dialect.from_descriptor(options['dialect'])
options['dialect'] = dialect

# Load the list of checks and its parameters declaratively as in https://framework.frictionlessdata.io/docs/checks/table.html
# Load the list of checks and parameters declaratively as in https://framework.frictionlessdata.io/docs/checks/table.html
if 'checks' in options:
checklist = [Check.from_descriptor(c) for c in options['checks']]
options['checks'] = checklist
checklist = Checklist(checks = [Check.from_descriptor(c) for c in options.pop('checks')])
else:
# Note that it's very important to initialise Checklist with NOTHING and not None if there are no checks declared
checklist = Checklist()
if 'pick_errors' in options:
checklist.pick_errors = options.pop('pick_errors', None)
if 'skip_errors' in options:
checklist.skip_errors = options.pop('skip_errors', None)

# remove limit_errors and limit_rows
limit_errors = options.pop('limit_errors', None)
limit_rows = options.pop('limit_rows', None)

with system.use_context(**frictionless_context):
report = validate(source, format=_format, schema=resource_schema, **options)
log.debug('Validating source: %s', source)
# load source as frictionless Resource
if resource_schema:
# with schema
resource = Resource(path=source, format=_format, schema=resource_schema, **options)
else:
# without schema
resource = Resource(path=source, format=_format, **options)

# add resource to a frictionless Package
package = Package(resources=[resource])

# if foreign keys are defined, we need to add the referenced resource(s) to the package
for reference in reference_resources:
referenced_resource = Resource(**reference)
package.add_resource(referenced_resource)

# report = validate(package, pick_errors=pick_errors, skip_errors=skip_errors, limit_errors=limit_errors)
report = package.validate(checklist=checklist, limit_errors=limit_errors, limit_rows=limit_rows)

return report


def _load_if_json(value):
try:
json_object = json.loads(value)
except ValueError as e:
return None
return json_object

def _prepare_foreign_keys(dataset, schema):
    """Build descriptors for the resources referenced by a schema's foreign keys.

    The reference resource of each entry in ``schema['foreignKeys']`` is
    resolved in this order:

    1. a string starting with ``http`` is used as-is as the resource path;
    2. a string holding a non-empty JSON object is parsed and the resulting
       dict is used as the resource path;
    3. anything else is looked up among the ``resource_type`` values of the
       resources in ``dataset``; that resource's url and lower-cased format
       are used.

    Cases 1 and 2 keep the default format ``'json'``. Foreign keys whose
    reference resource is an empty string are skipped.

    :param dataset: dataset dict whose ``resources`` list is searched
    :param schema: schema dict, optionally containing ``foreignKeys``
    :returns: list of dicts with ``name``, ``path`` and ``format`` keys, one
        per resolved foreign key
    :raises t.ValidationError: if a reference is not a url, a JSON object or
        the ``resource_type`` of a resource in this dataset
    """
    referenced_resources = []

    # Index the dataset's resources by resource_type once, instead of
    # rebuilding the mapping for every foreign key.
    dataset_resources = {
        r.get('resource_type'): {'url': r.get('url'), 'format': r.get('format')}
        for r in dataset.get('resources') or []
    }

    for foreign_key in schema.get('foreignKeys', []):
        log.debug('Prepping Foreign Key resources: %s', foreign_key)

        reference = foreign_key['reference']['resource']
        if reference == '':
            continue

        foreign_key_resource = None
        foreign_key_format = 'json'

        if reference.startswith('http'):
            log.debug('Foreign Key resource is at url: %s', reference)
            foreign_key_resource = reference
        else:
            # The three reference kinds are mutually exclusive: a url must
            # not fall through to the dataset lookup, where it would be
            # rejected as an unknown resource.
            json_object = _load_if_json(reference)
            if isinstance(json_object, dict) and json_object:
                log.debug(
                    'Foreign Key resource is a json object with keys: %s',
                    json_object.keys())
                foreign_key_resource = json_object
            elif reference in dataset_resources:
                log.debug(
                    'Foreign Key resource is (presumably) a resource in this dataset.')
                match = dataset_resources[reference]
                foreign_key_resource = match['url']
                foreign_key_format = match['format'].lower()
            else:
                raise t.ValidationError(
                    {'foreignKey': 'Foreign key reference does not exist. '
                                   'Must be a url, json object or a resource in this dataset.'})

        referenced_resources.append({
            'name': reference,
            'path': foreign_key_resource,
            'format': foreign_key_format,
        })

    log.debug('Foreign key resources required: %s', referenced_resources)
    return referenced_resources

def _get_site_user_api_key():

site_user_name = t.get_action('get_site_user')({'ignore_auth': True}, {})
Expand Down
8 changes: 4 additions & 4 deletions ckanext/validation/tests/test_interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ class TestInterfaceSync():
@pytest.mark.ckan_config('ckanext.validation.run_on_create_async', False)
@pytest.mark.ckan_config('ckanext.validation.run_on_update_async', False)
@pytest.mark.ckan_config('ckanext.validation.run_on_create_sync', True)
@mock.patch('ckanext.validation.jobs.validate',
@mock.patch('frictionless.Package.validate',
return_value=VALID_REPORT)
def test_can_validate_called_on_create_sync(self, mock_validation):

Expand All @@ -73,7 +73,7 @@ def test_can_validate_called_on_create_sync(self, mock_validation):
@pytest.mark.ckan_config('ckanext.validation.run_on_create_async', False)
@pytest.mark.ckan_config('ckanext.validation.run_on_update_async', False)
@pytest.mark.ckan_config('ckanext.validation.run_on_create_sync', True)
@mock.patch('ckanext.validation.jobs.validate')
@mock.patch('frictionless.Package.validate')
def test_can_validate_called_on_create_sync_no_validation(self, mock_validation):

dataset = factories.Dataset()
Expand All @@ -91,7 +91,7 @@ def test_can_validate_called_on_create_sync_no_validation(self, mock_validation)
@pytest.mark.ckan_config('ckanext.validation.run_on_create_async', False)
@pytest.mark.ckan_config('ckanext.validation.run_on_update_async', False)
@pytest.mark.ckan_config('ckanext.validation.run_on_update_sync', True)
@mock.patch('ckanext.validation.jobs.validate',
@mock.patch('frictionless.Package.validate',
return_value=VALID_REPORT)
def test_can_validate_called_on_update_sync(self, mock_validation):

Expand All @@ -113,7 +113,7 @@ def test_can_validate_called_on_update_sync(self, mock_validation):
@pytest.mark.ckan_config('ckanext.validation.run_on_create_async', False)
@pytest.mark.ckan_config('ckanext.validation.run_on_update_async', False)
@pytest.mark.ckan_config('ckanext.validation.run_on_update_sync', True)
@mock.patch('ckanext.validation.jobs.validate')
@mock.patch('frictionless.Package.validate')
def test_can_validate_called_on_update_sync_no_validation(self, mock_validation):

dataset = factories.Dataset()
Expand Down
Loading