From 62644883615b985481472bdb94151997c0fd3bf3 Mon Sep 17 00:00:00 2001 From: jbrown-xentity Date: Fri, 22 Oct 2021 14:03:57 -0600 Subject: [PATCH 1/2] Move datasets to delete first in line We have reports of datasets that get re-harvested with an extra `1` in the URL. We have confirmed these reports. It seems the harvest is doing the best it can to diagnose whether this is a new dataset or not, but it is still failing in some circumstances. This probably won't fix the bug; however, it will mitigate it. By running through the dataset removals first, if the spatial harvester is essentially doing a "delete and add" when it should be replacing, then the name of the new dataset hopefully won't collide with the one that is marked for deletion but is still in the system. --- ckanext/spatial/harvesters/waf.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/ckanext/spatial/harvesters/waf.py b/ckanext/spatial/harvesters/waf.py index 8f657e7f..23621af0 100644 --- a/ckanext/spatial/harvesters/waf.py +++ b/ckanext/spatial/harvesters/waf.py @@ -133,6 +133,19 @@ def create_extras(url, date, status): ids = [] + for location in delete: + obj = HarvestObject(job=harvest_job, + extras=create_extras('','', 'delete'), + guid=url_to_ids[location][0], + package_id=url_to_ids[location][1], + ) + model.Session.query(HarvestObject).\ + filter_by(guid=url_to_ids[location][0]).\ + update({'current': False}, False) + + obj.save() + ids.append(obj.id) + for location in new: guid=hashlib.md5(location.encode('utf8','ignore')).hexdigest() obj = HarvestObject(job=harvest_job, @@ -155,19 +168,6 @@ def create_extras(url, date, status): obj.save() ids.append(obj.id) - for location in delete: - obj = HarvestObject(job=harvest_job, - extras=create_extras('','', 'delete'), - guid=url_to_ids[location][0], - package_id=url_to_ids[location][1], - ) - model.Session.query(HarvestObject).\ - filter_by(guid=url_to_ids[location][0]).\ - update({'current': False}, False) - 
obj.save() - ids.append(obj.id) - if len(ids) > 0: log.debug('{0} objects sent to the next stage: {1} new, {2} change, {3} delete'.format( len(ids), len(new), len(change), len(delete))) From 8bea4dfdc1ad2b605a10ee0f71300271e4165229 Mon Sep 17 00:00:00 2001 From: jbrown-xentity Date: Thu, 28 Oct 2021 08:01:20 -0600 Subject: [PATCH 2/2] Entirely remove packages that are marked for delete If the harvest is managing the datasets in ckan, it seems that the harvest source should be the "source of truth". If this is the case, we shouldn't need the "revive" capability of soft-removing packages/datasets in ckan. I propose to actually purge the dataset within ckan. Since it's difficult/nearly impossible to track these files without a unique id, sometimes the harvester will delete and create a new item if the WAF or files change in any way. This would keep that behind the scenes, and allow the end user to get to the same dataset at the old URL. --- ckanext/spatial/harvesters/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/spatial/harvesters/base.py b/ckanext/spatial/harvesters/base.py index ccb47cbb..9e5111ea 100644 --- a/ckanext/spatial/harvesters/base.py +++ b/ckanext/spatial/harvesters/base.py @@ -467,7 +467,7 @@ def import_stage(self, harvest_object): context.update({ 'ignore_auth': True, }) - p.toolkit.get_action('package_delete')(context, {'id': harvest_object.package_id}) + p.toolkit.get_action('package_purge')(context, {'id': harvest_object.package_id}) log.info('Deleted package {0} with guid {1}'.format(harvest_object.package_id, harvest_object.guid)) return True