From 74c69d3377d2fc602f8f7d445262c404f49995cd Mon Sep 17 00:00:00 2001 From: Fabian Steeg Date: Mon, 30 Oct 2023 13:48:14 +0100 Subject: [PATCH] Tweak rppd-to-lobid transformation for lobid-gnd usage (RPB-102) - Prefer GND IDs and namespace for `id` field - Add `type` field with hard-coded values - Set filenames and indexing settings --- conf/rppd-to-lobid.fix | 24 +++++++++++++++--------- conf/rppd-to-lobid.flux | 6 +++--- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/conf/rppd-to-lobid.fix b/conf/rppd-to-lobid.fix index b2e3dbcb..0af63f78 100644 --- a/conf/rppd-to-lobid.fix +++ b/conf/rppd-to-lobid.fix @@ -4,10 +4,21 @@ nothing() #00 RPPD-ID # Komentar Doku: (Achtung: 00 BLANK) -copy_field("f00_","rppdId") copy_field("f00_","id") -prepend("id","https://lbz.rlp.de/rppd/") +copy_field("f00_","rppdId") +prepend("id","https://rppd.lobid.org/") + +# ------- +#82b (GND-ID (R)) -> gndIdentifier +# Kommentar Doku: ohne das vorangestellte Präfix (DE-588) +copy_field("f82b", "gndIdentifier") +unless all_contain("f82b", "Keine GND-Ansetzung") + copy_field("f82b","id") + prepend("id","https://d-nb.info/gnd/") +end + +set_array("type[]", "AuthorityResource", "Person", "DifferentiatedPerson") # #1na (Name, bevorzugte Form) -> preferredName # Name ist aber Kombination aus Geburtsdaten und Name e.g. "f1na": "Marquard, Udo / 1959-" @@ -117,12 +128,6 @@ end replace_all("f1ny", "(\\d{4})(\\d{2})(\\d{2})", "$1-$2-$3") copy_field("f1ny", "describedBy.dateModified") -# ------- -#82b (GND-ID (R)) -> gndIdentifier -# Kommentar Doku: ohne das vorangestellte Präfix (DE-588) - -copy_field("f82b", "gndIdentifier") - # ------- #1z1 (1. biogr. Anmerkung) -> biographicalOrHistoricalInformation # Kommentar Doku: getrennt durch "; ". Keine Abkürzungen benutzen, da die Stichworte in Register 9 indexiert werden. Zitate aus Quellen in Anführungszeichen; bei sehr langen, biogr. Anmerkungen wird der Text auf mehrere Kategorien aufgeteilt: #1z2, #1z3, #1z4 ... #1z9. Bei eingespielten Biographien werden die Angaben zum Originalwerk am Ende angegeben: --- [Daten übernommen aus: ....] @@ -130,7 +135,8 @@ copy_field("f82b", "gndIdentifier") vacuum() retain( "rppdId", - "id", + "id", + "type[]", "preferredName", "variantName[]", "dateOfBirth[]", diff --git a/conf/rppd-to-lobid.flux b/conf/rppd-to-lobid.flux index e12c7d81..fce6039e 100644 --- a/conf/rppd-to-lobid.flux +++ b/conf/rppd-to-lobid.flux @@ -1,11 +1,11 @@ -default outfile = "conf/output/bulk/bulk-rppd-${i}.ndjson"; -"conf/output/test-output-rppd.json" +default outfile = "conf/output/bulk/rppd/bulk-rppd-${i}.jsonl"; // lobid-gnd expects *.jsonl suffix +"conf/output/output-rppd-strapi.ndjson" | open-file | as-lines | decode-json | fix(FLUX_DIR + "rppd-to-lobid.fix") | batch-reset(batchsize="1000") | encode-json(prettyPrinting="false") -| json-to-elasticsearch-bulk(idkey="id", type="resource", index="resources-alma-fix-staging") +| json-to-elasticsearch-bulk(idkey="id", type="authority", index="gnd-rppd-test") | write(outfile) ;