Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve RDM to CLM migration #23 #26

Merged
merged 28 commits into from
Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
ab25446
Avoid trying to create list items that don't belong in hierarchy when …
johnatawnclementawn Aug 13, 2024
a834ac0
clarification
johnatawnclementawn Aug 13, 2024
97c31c2
Reduce queries to existing RDM tables, use temp tbl to gather records…
johnatawnclementawn Aug 13, 2024
cc7a569
Improve filter on which tree is being built for concepts in multiple …
johnatawnclementawn Aug 14, 2024
552b9ca
Mint new item and itemvalue ids for concepts that participate in mult…
johnatawnclementawn Aug 14, 2024
1eecf83
Add descriptions for new logic #23
johnatawnclementawn Aug 14, 2024
3e5203b
Check for listitems that already exist in CLM, but participate in col…
johnatawnclementawn Aug 15, 2024
3ed8d95
Add note about apostrophes in collection names
johnatawnclementawn Aug 15, 2024
1cd0ae4
nit #23
johnatawnclementawn Aug 15, 2024
d9486d7
Simplify logic for minting new ids for items and values #23
johnatawnclementawn Aug 15, 2024
2adc1bb
Merge branch 'main' into jmc/23_improve_rdm_to_clm_migration
johnatawnclementawn Aug 19, 2024
bf3b9d4
Merge branch 'main' into jmc/23_improve_rdm_to_clm_migration
johnatawnclementawn Aug 29, 2024
3e2821c
Add note for how to handle apostrophes in collection names on python …
johnatawnclementawn Aug 29, 2024
764df7b
Merge branch 'main' into jmc/23_improve_rdm_to_clm_migration
johnatawnclementawn Sep 11, 2024
5726d1b
nit #23
johnatawnclementawn Sep 11, 2024
f8490d5
Avoid id clashes when migrating collections with the same concepts th…
johnatawnclementawn Sep 11, 2024
0c64d03
Avoid hardcoding prefLabel #23
johnatawnclementawn Sep 11, 2024
950106c
Move tests to use django native test fixtures #23
johnatawnclementawn Sep 11, 2024
34bb961
typo nit #23
johnatawnclementawn Sep 11, 2024
b50924b
nit #23
johnatawnclementawn Sep 12, 2024
eaabd91
Rearrange test fixtures #23
johnatawnclementawn Sep 12, 2024
7939a9f
Add more robust tests for RDM to CLM migration #23
johnatawnclementawn Sep 12, 2024
862903e
typo nits #23
johnatawnclementawn Sep 12, 2024
fae5c51
Harden against nonexistent psl options #23
johnatawnclementawn Sep 12, 2024
9fcdd84
Capture all possible value types #23
johnatawnclementawn Sep 12, 2024
b77e2ce
Makes fixtures more accessible
johnatawnclementawn Sep 12, 2024
6c90e60
Add test to ensure psl cmd error is functional #23
johnatawnclementawn Sep 12, 2024
f57a31b
Error nicely if list with same name as collection already exists
jacobtylerwalls Sep 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions arches_references/management/commands/controlled_lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ def migrate_collections_to_controlled_lists(
-ho 'http://localhost:8000/plugins/controlled-list-manager/item/'
-psl 'fr'
-ow

for collection names that contain an apostrophe, wrap the name in double quotes and double the apostrophe, e.g. "John''s list"

jacobtylerwalls marked this conversation as resolved.
Show resolved Hide resolved
"""

collections_in_db = list(
Expand Down
266 changes: 198 additions & 68 deletions arches_references/migrations/0002_etl_collections_to_controlled_lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@ class Migration(migrations.Migration):
create or replace function __arches_migrate_collections_to_clm(
collection_names text[] default null, -- one or more collections to be migrated to controlled lists
host text default 'http://localhost:8000/plugins/controlled-list-manager/item/',
overwrite boolean default FALSE,
overwrite boolean default FALSE,
preferred_sort_language text default 'en'
)
returns text as $$
declare failed_collections text[];
collection text;
listitems_to_update_with_multiple_values uuid[];
begin
-- RDM Collections to Controlled Lists & List Items Migration --
-- To use, run:
Expand All @@ -29,20 +31,22 @@ class Migration(migrations.Migration):
-- 'en'
-- );
-- where the input array values are concept prefLabels or identifiers and the optional language is used for sorting
-- for collections that contain an apostrophe, use two single quotes, e.g. 'John''s list'

-- Conceptually:
-- a collection becomes a list
-- a concept belonging to a collection becomes a list item
-- a concept at the top of a collection does NOT have a parent list item and should have a depth of 0
-- a concept below the top concepts of the collection will have a parent list item and should have a depth of > 0
-- a prefLabel and any altLabels for a concept become list item values
-- a concept that participates in multiple collections will have distinct list items for each new list it belongs to

-- in the RDM concepts are sorted alphabetically, but are explicitly ordered using a list item's sortorder...
-- in the RDM concepts are sorted alphabetically, but list items are explicitly ordered using sortorder...
-- sort order is calculated at the list level and ordered alphabetically within each leaf of the hierarchy

-- Check if collection_names are provided
if collection_names is null or array_length(collection_names, 1) = 0 then
return 'No collection names or identifiers provided.';
raise exception 'No collection names or identifiers provided.';
end if;

-- Check if input collection names or identifiers exist in the database
Expand Down Expand Up @@ -133,69 +137,200 @@ class Migration(migrations.Migration):

-- The recursive CTE below is used to assign the conceptid of the list at the root to each concept to be migrated
-- On each recursion, it checks if the child (aka conceptidto in relations table) is a parent for another concept
-- All the while, it keeps track of the depth of the child concept, to be used for sorting in the next CTE
with recursive collection_hierarchy as (
select conceptidfrom as root_list,
conceptidto as child,
0 as depth
from relations
where not exists (
select 1 from relations r2 where r2.conceptidto = relations.conceptidfrom
) and relationtype = 'member'
union all
select ch.root_list,
r.conceptidto,
ch.depth + 1
from collection_hierarchy ch
join relations r on ch.child = r.conceptidfrom
where relationtype = 'member'
),
-- Rank prefLabels by user provided language,
-- if no prefLabel in that language exists for a concept, fall back on next prefLabel ordered by languageid
ranked_prefLabels as (
select ch.root_list,
ch.child,
ch.depth,
v.languageid, v.value,
ROW_NUMBER() OVER (PARTITION BY ch.child ORDER BY (v.languageid = preferred_sort_language) DESC, languages.id) AS language_rank,
r.conceptidfrom
from collection_hierarchy ch
left join values v on v.conceptid = ch.child
left join relations r on r.conceptidto = ch.child
left join languages on v.languageid = languages.code
where v.valuetype = 'prefLabel' and
r.relationtype = 'member'
),
-- Once we've assigned our root_list, we want to sort the children (to depth n) alphabetically based on their ranked prefLabel
-- We also want to take INTO account the child's parent value, so the relations table is joined back to capture the parent.
alpha_sorted_list_item_hierarchy as (
select child as id,
row_number() over (partition by root_list order by depth, LOWER(value)) - 1 as sortorder,
root_list as list_id,
case when conceptidfrom = root_list then null -- list items at top of hierarchy have no parent list item
else conceptidfrom
end as parent_id,
depth
from ranked_prefLabels rpl
where language_rank = 1 and
root_list in (select id from arches_references_list where name = ANY(collection_names))
-- All the while, it keeps track of the depth of the child concept, to be used for sorting in the next CTE
-- The results are stored in a temporary table to avoid re-running non-filtered recursion (done on the whole relations table)
-- We keep track of the hierarchy path in order to account for concepts that participate in multiple collections

create temporary table temp_collection_hierarchy as
with recursive collection_hierarchy as (
select conceptidfrom as root_list,
conceptidto as child,
ARRAY[conceptidfrom] AS path,
0 as depth
from relations
where not exists (
select 1 from relations r2 where r2.conceptidto = relations.conceptidfrom
) and relationtype = 'member'
union all
select ch.root_list,
r.conceptidto,
ch.path || r.conceptidfrom,
ch.depth + 1
from collection_hierarchy ch
join relations r on ch.child = r.conceptidfrom
where relationtype = 'member'
)
select * from collection_hierarchy;

-- This temp table is used to stage list items and values
create temporary table temp_list_items_and_values (
list_item_id uuid,
sortorder bigint,
list_id uuid,
parent_id uuid,
legacy_conceptid uuid,
listitemvalue_id uuid,
listitemvalue text,
listitemvalue_languageid text,
listitemvalue_valuetype text,
rownumber int
);

-- Build the new hierarchies at the list level, mainly to account for concepts that participate in multiple collections
-- then stash results in temp table for preprocessing before inserting into CLM tables
foreach collection in array collection_names loop
with filtered_collection_hierarchy as (
select *
from temp_collection_hierarchy
where root_list in (select id from arches_references_list where name = collection)
),
-- Rank prefLabels by user provided language,
-- if no prefLabel in that language exists for a concept, fall back on next prefLabel ordered by languageid
ranked_prefLabels as (
select ch.root_list,
ch.child,
ch.depth,
v.languageid, v.value,
ROW_NUMBER() OVER (PARTITION BY ch.child ORDER BY (v.languageid = preferred_sort_language) DESC, languages.id) AS language_rank,
r.conceptidfrom,
ch.path
from filtered_collection_hierarchy ch
left join values v on v.conceptid = ch.child
left join relations r on r.conceptidto = ch.child
left join languages on v.languageid = languages.code
where v.valuetype = 'prefLabel'
and r.relationtype = 'member'
and r.conceptidfrom in (select unnest(path) from filtered_collection_hierarchy)
),
-- Once we've assigned our root_list, we want to sort the children (to depth n) alphabetically based on their ranked prefLabel
-- We also want to take into account the child's parent value, so the relations table is joined back to capture the parent.
alpha_sorted_list_item_hierarchy as (
select child as id,
row_number() over (partition by root_list order by depth, LOWER(value)) - 1 as sortorder,
root_list as list_id,
case when conceptidfrom = root_list then null -- list items at top of hierarchy have no parent list item
else conceptidfrom
end as parent_id,
depth
from ranked_prefLabels rpl
where language_rank = 1 and
root_list in (select id from arches_references_list where name = collection)
)
insert into temp_list_items_and_values (
list_item_id,
sortorder,
list_id,
parent_id,
legacy_conceptid,
listitemvalue_id,
listitemvalue,
listitemvalue_languageid,
listitemvalue_valuetype
)
select lih.id as list_item_id,
lih.sortorder,
lih.list_id,
lih.parent_id,
lih.id as legacy_conceptid,
v.valueid as listitemvalue_id,
v.value,
v.languageid,
v.valuetype
from alpha_sorted_list_item_hierarchy lih
join values v on v.conceptid = lih.id
where valuetype = 'prefLabel'
or valuetype = 'altLabel'
or valuetype = 'scopeNote'
or valuetype = 'definition'
or valuetype = 'example'
or valuetype = 'historyNote'
or valuetype = 'editorialNote'
or valuetype = 'changeNote'
or valuetype = 'note'
or valuetype = 'description';
end loop;

-- Assign row number to help identify concepts that participate in multiple collections
-- or exist already as listitems and therefore need new listitem_id's and listitemvalue_id's
with assign_row_num as (
select list_item_id,
sortorder,
list_id,
parent_id,
existing_item,
ROW_NUMBER() OVER (PARTITION BY list_item_id ORDER BY existing_item DESC, sortorder ASC) as init_rownumber
from (
select list_item_id,
sortorder,
list_id,
parent_id,
FALSE as existing_item
from temp_list_items_and_values
union all
select id as list_item_id,
sortorder,
list_id,
parent_id,
TRUE as existing_item
from arches_references_listitem
) as t
)
update temp_list_items_and_values t
set rownumber = init_rownumber
from assign_row_num a
where t.list_item_id = a.list_item_id
and t.list_id = a.list_id;

-- For concepts that participate in multiple collections, mint new listitem_id's and listitemvalue_id's
-- However, if a concept needs a new listitem_id, and has multiple values associated with it, ensure that
-- the new listitem_id is the same for all listitemvalues
listitems_to_update_with_multiple_values := array(
select list_item_id
from temp_list_items_and_values
where rownumber > 1
group by list_item_id
having count(*) > 1
);

with new_list_item_ids as (
select legacy_list_item_id,
uuid_generate_v4() as new_list_item_id
from unnest(listitems_to_update_with_multiple_values) as t(legacy_list_item_id)
)
insert into arches_references_listitem(
update temp_list_items_and_values t
set list_item_id = new_list_item_id
from new_list_item_ids n
where t.list_item_id = n.legacy_list_item_id
and rownumber > 1;

-- Update list_item_ids for items that don't have multiple values
update temp_list_items_and_values
set list_item_id = uuid_generate_v4()
where rownumber > 1
and legacy_conceptid != any(listitems_to_update_with_multiple_values)
and list_item_id = legacy_conceptid;

-- Update listitemvalue_ids
update temp_list_items_and_values
set listitemvalue_id = uuid_generate_v4()
where rownumber > 1;

insert into arches_references_listitem (
johnatawnclementawn marked this conversation as resolved.
Show resolved Hide resolved
id,
uri,
sortorder,
guide,
list_id,
parent_id
)
select id,
host || id as uri,
select distinct on (list_item_id, list_id)
list_item_id,
host || legacy_conceptid as uri,
sortorder,
false as guide,
list_id,
parent_id
from alpha_sorted_list_item_hierarchy;

from temp_list_items_and_values;

-- Migrate concept values -> controlled list item values
insert into arches_references_listitemvalue (
Expand All @@ -205,20 +340,15 @@ class Migration(migrations.Migration):
languageid,
valuetype_id
)
select distinct (v.valueid) id,
value,
r.conceptidto as list_item_id,
languageid,
valuetype as valuetype_id
from relations r
full join values v on r.conceptidto = v.conceptid
where relationtype = 'member' and
(valuetype = 'prefLabel' or valuetype = 'altLabel') and
r.conceptidto in (
select id from arches_references_listitem where list_id in (
select id from arches_references_list where name = ANY(collection_names)
)
);
select listitemvalue_id,
listitemvalue,
list_item_id,
listitemvalue_languageid,
listitemvalue_valuetype
from temp_list_items_and_values;

drop table if exists temp_collection_hierarchy;
drop table if exists temp_list_items_and_values;

return format('Collection(s) %s migrated to controlled list(s)', array_to_string(collection_names, ', '));
end;
Expand Down
21 changes: 6 additions & 15 deletions tests/cli_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,34 +66,25 @@ class RDMToControlledListsETLTests(TestCase):
@classmethod
def setUpTestData(cls):

skos = SKOSReader()
rdf = skos.read_file(
os.path.join(PROJECT_TEST_ROOT, "data", "concept_label_test_collection.xml")
)
ret = skos.save_concepts_from_skos(rdf)

client = Client()
client.login(username="admin", password="admin")
response = client.get(
reverse(
"make_collection",
kwargs={"conceptid": "7c90899a-dbe9-4574-9175-e69481a80b3c"},
)
management.call_command(
"loaddata",
"tests/data/polyhierarchical_collections.json",
format="json",
johnatawnclementawn marked this conversation as resolved.
Show resolved Hide resolved
)

def test_migrate_collections_to_controlled_lists(self):
output = io.StringIO()
management.call_command(
"controlled_lists",
operation="migrate_collections_to_controlled_lists",
collections_to_migrate=["Concept Label Import Test"],
collections_to_migrate=["Polyhierarchical Collection Test"],
host="http://localhost:8000/plugins/controlled-list-manager/item/",
preferred_sort_language="en",
johnatawnclementawn marked this conversation as resolved.
Show resolved Hide resolved
overwrite=False,
stdout=output,
)

imported_list = List.objects.get(name="Concept Label Import Test")
imported_list = List.objects.get(name="Polyhierarchical Collection Test")
imported_items = imported_list.list_items.all()
self.assertEqual(len(imported_items), 3)

Expand Down
Loading