From 62644883615b985481472bdb94151997c0fd3bf3 Mon Sep 17 00:00:00 2001 From: jbrown-xentity Date: Fri, 22 Oct 2021 14:03:57 -0600 Subject: [PATCH 1/2] Move datasets to delete first in line We have reports of datasets that get re-harvested with an extra `1` in the URL. We have confirmed these reports. It seems the harvest is doing the best it can to diagnose whether this is a new dataset or not, but it is still failing in some circumstances. This probably won't fix the bug; however, it will mitigate it. By running through the dataset removals first, if the spatial harvester is essentially doing a "delete and add" when it should be replacing, then the name of the new dataset hopefully won't collide with the one that is marked for deletion but is still in the system. --- ckanext/spatial/harvesters/waf.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/ckanext/spatial/harvesters/waf.py b/ckanext/spatial/harvesters/waf.py index 8f657e7f..23621af0 100644 --- a/ckanext/spatial/harvesters/waf.py +++ b/ckanext/spatial/harvesters/waf.py @@ -133,6 +133,19 @@ def create_extras(url, date, status): ids = [] + for location in delete: + obj = HarvestObject(job=harvest_job, + extras=create_extras('','', 'delete'), + guid=url_to_ids[location][0], + package_id=url_to_ids[location][1], + ) + model.Session.query(HarvestObject).\ + filter_by(guid=url_to_ids[location][0]).\ + update({'current': False}, False) + + obj.save() + ids.append(obj.id) + for location in new: guid=hashlib.md5(location.encode('utf8','ignore')).hexdigest() obj = HarvestObject(job=harvest_job, @@ -155,19 +168,6 @@ def create_extras(url, date, status): obj.save() ids.append(obj.id) - for location in delete: - obj = HarvestObject(job=harvest_job, - extras=create_extras('','', 'delete'), - guid=url_to_ids[location][0], - package_id=url_to_ids[location][1], - ) - model.Session.query(HarvestObject).\ - filter_by(guid=url_to_ids[location][0]).\ - update({'current': False}, False) - 
obj.save() - ids.append(obj.id) - if len(ids) > 0: log.debug('{0} objects sent to the next stage: {1} new, {2} change, {3} delete'.format( len(ids), len(new), len(change), len(delete))) From 8bea4dfdc1ad2b605a10ee0f71300271e4165229 Mon Sep 17 00:00:00 2001 From: jbrown-xentity Date: Thu, 28 Oct 2021 08:01:20 -0600 Subject: [PATCH 2/2] Entirely remove packages that are marked for delete If the harvest is managing the datasets in ckan, it seems that the harvest source should be the "source of truth". If this is the case, we shouldn't need the "revive" capability of soft-removing packages/datasets in ckan. I propose to actually purge the dataset within ckan. Since it's difficult/nearly impossible to track these files without a unique id, sometimes the harvester will delete and create a new item if the WAF or files change in any way. This would keep that behind the scenes, and allow the end user to get to the same dataset at the old URL. --- ckanext/spatial/harvesters/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/spatial/harvesters/base.py b/ckanext/spatial/harvesters/base.py index ccb47cbb..9e5111ea 100644 --- a/ckanext/spatial/harvesters/base.py +++ b/ckanext/spatial/harvesters/base.py @@ -467,7 +467,7 @@ def import_stage(self, harvest_object): context.update({ 'ignore_auth': True, }) - p.toolkit.get_action('package_delete')(context, {'id': harvest_object.package_id}) + p.toolkit.get_action('package_purge')(context, {'id': harvest_object.package_id}) log.info('Deleted package {0} with guid {1}'.format(harvest_object.package_id, harvest_object.guid)) return True