diff --git a/bin/python_scripts/remove_duplicates.py b/bin/python_scripts/remove_duplicates.py
index 3b3f6d88..ea7f7b90 100644
--- a/bin/python_scripts/remove_duplicates.py
+++ b/bin/python_scripts/remove_duplicates.py
@@ -76,14 +76,13 @@ def is_local():
 
 
 # retrieve active package_ids which are matching titles from 2 publishers with more than 100 duplicate datasets
-# skip the latest record as that will be the latest updated record retained for publication
 NOV_2024_PACKAGE_IDS_SQL = "SELECT package_extra.package_id FROM package_extra, harvest_object WHERE " \
                            "harvest_object.id = value AND key = 'harvest_object_id' AND value IN (" \
                            "SELECT id FROM harvest_object WHERE id IN (" \
                            "SELECT value FROM package_extra WHERE key = 'harvest_object_id' AND package_id IN (" \
                            "SELECT id FROM package WHERE title = '%s' AND state = 'active' AND owner_org IN " \
                            "('c924c995-e063-4f30-bbd3-61418486f0a9', 'b6b50d70-9d5c-4fef-9135-7756cca343c3')))) " \
-                           "ORDER BY metadata_modified_date DESC OFFSET 1 ROWS;"
+                           "ORDER BY metadata_modified_date DESC;"
 
 
 def get_duplicate_datasets(sql, token=None):
@@ -108,6 +107,22 @@ def delete_dataset(dataset):
         logger.error('Subprocess Failed, exception occured: %s', exc_info=exception)
 
 
+def reindex_dataset(dataset):
+    """Run `ckan search-index rebuild` for a single dataset.
+
+    dataset is a result row; its first element is passed to the command as the dataset identifier.
+    """
+    command = 'ckan search-index rebuild {}'.format(
+        dataset[0])
+
+    logger.info('CKAN reindex dataset - Running command: %s', command)
+
+    try:
+        subprocess.call(command, shell=True)
+    except Exception as exception:
+        logger.error('Subprocess Failed, exception occured: %s', exception, exc_info=exception)
+
+
 def create_csv(rows):
     with open("deleted_datasets.csv", "w+") as f:
         for row in rows:
@@ -167,7 +182,15 @@ def main(command=None, sql="NOV_2024_TITLES_SQL", subset_sql="NOV_2024_PACKAGE_I
     counter = 0
     for dataset in get_duplicate_datasets(sql):
         if subset_sql:
+            reindexed_dataset = False
             for subset_dataset in get_duplicate_datasets(subset_sql, token=dataset):
+                # rows are ordered by metadata_modified_date DESC, so the first row is the latest record:
+                # reindex it to make it available and skip it; only the remaining rows are handled as duplicates
+                if not reindexed_dataset:
+                    reindex_dataset(subset_dataset)
+                    reindexed_dataset = True
+                    continue
+
                 counter += 1
                 logger.info('%d - %r', counter, f"{dataset}-{subset_dataset}")
 