-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Reuse lobid-transformation for hebis data RPD-225
I reuse the whole transformation; so far no adjustments have been made for the specific hebis data source, e.g. almaMmsId should be renamed hebisId.
- Loading branch information
Showing
38 changed files
with
103,739 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,11 @@ | ||
// SRU searchRetrieve request against sru.hebis.de: query on pica.ort and pica.ppn, recordSchema=marc21, recordPacking=xml, maximumRecords=10. | ||
SRUQUERRY = "http://sru.hebis.de/sru/DB=2.1?query=pica.ort+%3D+%22Mainz%22+and+pica.ppn+%3D+%22524204101%22&version=1.1&operation=searchRetrieve&stylesheet=http%3A%2F%2Fsru.hebis.de%2Fsru%2F%3Fxsl%3DsearchRetrieveResponse&recordSchema=marc21&maximumRecords=10&startRecord=1&recordPacking=xml&sortKeys=LST_Y%2Cpica%2C0%2C%2C"; | ||
// Flux variable; fix code in this commit reads a variable of this name as $[createEndTime]. | ||
// NOTE(review): presumably handed to the fix through the "*" argument of the second fix() call - confirm. | ||
createEndTime = "1"; | ||
|
||
// Pipeline: fetch the SRU response over HTTP, decode it as XML/MARCXML, apply the lobid transformation and emit pretty-printed JSON. | ||
// NOTE(review): fix("nothing()") looks like a no-op step ahead of the real transformation - confirm it is still needed. | ||
SRUQUERRY | ||
| open-http(accept="application/xml") | ||
| decode-xml | ||
| handle-marcxml | ||
| fix("nothing()") | ||
| fix(FLUX_DIR + "lobid-transformation/marcToLobid.fix",*) | ||
| encode-json(prettyPrinting="true") | ||
; |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
|
||
# describedBy: metadata about this description. id and label are both derived from almaMmsId. | ||
# NOTE(review): URIs and labels still refer to lobid/hbz; the commit message says hebis-specific adjustments are pending. | ||
copy_field("almaMmsId", "describedBy.id") | ||
prepend("describedBy.id", "http://lobid.org/resources/") | ||
|
||
copy_field("almaMmsId", "describedBy.label") | ||
prepend("describedBy.label", "Webseite der hbz-Ressource ") | ||
|
||
set_array("describedBy.type[]", "BibliographicDescription") | ||
|
||
|
||
# Constant dataset information attached to every record. | ||
add_field("describedBy.inDataset.id","http://lobid.org/resources/dataset#!") | ||
|
||
add_field("describedBy.inDataset.label","lobid-resources – Der hbz-Verbundkatalog als Linked Open Data") | ||
|
||
set_array("describedBy.resultOf.type[]", "CreateAction") | ||
|
||
# $[createEndTime] is a variable supplied from outside this fix. | ||
# The value "0" yields a fixed placeholder endTime; any other value writes a timestamp in the given format and timezone. | ||
add_field("@createTime","$[createEndTime]") | ||
if all_match("@createTime","0") | ||
add_field("describedBy.resultOf.endTime","0000-00-00T00:00:00") | ||
else | ||
timestamp("describedBy.resultOf.endTime",format:"yyyy-MM-dd'T'HH:mm:ss", timezone:"Europe/Berlin") | ||
end | ||
|
||
|
||
# Constant description of the software (instrument) that produced the record. | ||
add_field("describedBy.resultOf.instrument.id","https://github.com/hbz/lobid-resources") | ||
|
||
set_array("describedBy.resultOf.instrument.type[]", "SoftwareApplication") | ||
|
||
add_field("describedBy.resultOf.instrument.label","Software lobid-resources") | ||
|
||
# The source object id is the almaMmsId prefixed with the lobid MARCXML base URL. | ||
copy_field("almaMmsId","describedBy.resultOf.object.id") | ||
prepend("describedBy.resultOf.object.id","https://lobid.org/marcxml/") | ||
|
||
# MNG is an ALMA-specific element: subfield b feeds dateCreated, subfield d feeds dateModified. | ||
# NOTE(review): other field paths in this transformation are five characters wide (tag plus two indicator positions, e.g. "856??"); confirm the spacing of "MNG .b" / "MNG .d". | ||
|
||
copy_field("MNG .b","describedBy.resultOf.object.dateCreated") | ||
copy_field("MNG .d","describedBy.resultOf.object.dateModified") | ||
# Reduce both values to plain digits: drop hyphens, cut everything from the first blank onwards, then strip stray characters. | ||
# The stray-character pattern is a character class so that "." is a literal dot. As a bare alternative ("|.|") it matched | ||
# every character, which emptied the value and made the check below discard every date. | ||
replace_all("describedBy.resultOf.object.dateCreated","-","") | ||
replace_all("describedBy.resultOf.object.dateCreated"," .*","") | ||
replace_all("describedBy.resultOf.object.dateCreated","[c©\\s,.:;/=]","") | ||
replace_all("describedBy.resultOf.object.dateModified","-","") | ||
replace_all("describedBy.resultOf.object.dateModified"," .*","") | ||
replace_all("describedBy.resultOf.object.dateModified","[c©\\s,.:;/=]","") | ||
# Discard anything that does not look like yyyyMMdd or yyyy after the cleanup. | ||
unless any_match("describedBy.resultOf.object.dateCreated","\\d{8}|\\d{4}") | ||
remove_field("describedBy.resultOf.object.dateCreated") | ||
end | ||
unless any_match("describedBy.resultOf.object.dateModified","\\d{8}|\\d{4}") | ||
remove_field("describedBy.resultOf.object.dateModified") | ||
end | ||
# Format as yyyy-MM-dd; a bare year is expanded to January 1st of that year. | ||
replace_all("describedBy.resultOf.object.dateCreated","^(\\d{4})(\\d{2})(\\d{2})$","$1-$2-$3") | ||
replace_all("describedBy.resultOf.object.dateModified","^(\\d{4})(\\d{2})(\\d{2})$","$1-$2-$3") | ||
replace_all("describedBy.resultOf.object.dateCreated","^(\\d{4})$","$1-01-01") | ||
replace_all("describedBy.resultOf.object.dateModified","^(\\d{4})$","$1-01-01") | ||
|
||
# Type, label and dataset of the source object (the MARC21 XML export item). | ||
set_array("describedBy.resultOf.object.type[]", "DataFeedItem") | ||
|
||
# Label reads "hbz-Ressource <almaMmsId> im Exportformat MARC21 XML". | ||
copy_field("almaMmsId","describedBy.resultOf.object.label") | ||
prepend("describedBy.resultOf.object.label","hbz-Ressource ") | ||
append("describedBy.resultOf.object.label"," im Exportformat MARC21 XML") | ||
|
||
add_field("describedBy.resultOf.object.inDataset.id", "https://datahub.io/dataset/hbz_unioncatalog") | ||
|
||
add_field("describedBy.resultOf.object.inDataset.label", "hbz_unioncatalog") | ||
|
||
# A single CC0 licence entry: $append creates the entry, $last adds the label to that same entry. | ||
set_array("describedBy.license[]") | ||
add_field("describedBy.license[].$append.id","http://creativecommons.org/publicdomain/zero/1.0" ) | ||
add_field("describedBy.license[].$last.label","Creative Commons-Lizenz CC0 1.0 Universal" ) | ||
|
||
|
||
# TODO: It seems that there are a lot of organisations that are not in lobid, we should filter them out. | ||
|
||
# 040 - Cataloging Source (NR) - Subfield: $a (NR), $c (NR), $d (R) | ||
# ALMA has a lot of invalid repeated subfields $a, so only the first $a (and the first $c) is kept. | ||
# NOTE(review): other field paths in this transformation are five characters wide (tag plus two indicator positions); confirm the spacing of "040 ". | ||
|
||
do list(path: "040 ", "var":"$i") | ||
|
||
# $a -> sourceOrganization.id; the "unless exists" guard keeps the first value only. | ||
do list(path:"$i.a","var":"$j") | ||
unless exists("describedBy.resultOf.object.sourceOrganization.id") | ||
copy_field("$j", "describedBy.resultOf.object.sourceOrganization.id") | ||
end | ||
end | ||
# $c -> provider.id; again only the first value is kept. | ||
do list(path:"$i.c","var":"$j") | ||
unless exists("describedBy.resultOf.object.provider.id") | ||
copy_field("$j", "describedBy.resultOf.object.provider.id") | ||
end | ||
end | ||
|
||
# Every $d becomes its own modifiedBy entry. | ||
# NOTE(review): set_array sits inside the 040 loop; if a record carried several 040 fields, confirm that earlier entries are not reset. | ||
set_array("describedBy.resultOf.object.modifiedBy[]") | ||
do list(path:"$i.d", "var":"$j") | ||
copy_field("$j", "describedBy.resultOf.object.modifiedBy[].$append.id") | ||
end | ||
|
||
end | ||
|
||
# For sourceOrganization, provider and each modifiedBy entry: run the provenanceLinks macro on the id, | ||
# copy the resulting id to label and resolve the label through the lobidOrgLabels map. | ||
# The macro and the map are defined outside this fragment. | ||
# NOTE(review): delete:"true" presumably drops labels that have no entry in the map - confirm. | ||
call_macro("provenanceLinks",field: "describedBy.resultOf.object.sourceOrganization.id") | ||
copy_field("describedBy.resultOf.object.sourceOrganization.id","describedBy.resultOf.object.sourceOrganization.label") | ||
lookup("describedBy.resultOf.object.sourceOrganization.label","lobidOrgLabels",delete:"true") | ||
call_macro("provenanceLinks",field: "describedBy.resultOf.object.provider.id") | ||
copy_field("describedBy.resultOf.object.provider.id","describedBy.resultOf.object.provider.label") | ||
lookup("describedBy.resultOf.object.provider.label","lobidOrgLabels",delete:"true") | ||
do list(path:"describedBy.resultOf.object.modifiedBy[]","var":"$i") | ||
call_macro("provenanceLinks",field: "$i.id") | ||
copy_field("$i.id","$i.label") | ||
end | ||
lookup("describedBy.resultOf.object.modifiedBy[].*.label","lobidOrgLabels",delete:"true") | ||
|
||
# Remove duplicate modifiedBy entries. | ||
uniq("describedBy.resultOf.object.modifiedBy[]") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
# MARC control field 001 is used as almaMmsId and as the variable part of the record URI. | ||
# NOTE(review): the commit message suggests renaming almaMmsId to hebisId for this data source. | ||
copy_field("001","almaMmsId") | ||
|
||
# Builds "http://lobid.org/resources/<001>#!"; arguments starting with "~" are literal strings, the others are field paths. | ||
paste("id", "~http://lobid.org/resources/", "001", "~#!", join_char: "") | ||
|
||
|
||
# 024 - Other Standard Identifier (R) Subfield: $a (NR) $2 (NR) | ||
# urn: taken from 024 with first indicator 7 when $2 equals "urn". | ||
|
||
set_array("urn[]") | ||
|
||
do list(path: "0247?", "var": "$i") | ||
if any_equal("$i.2","urn") | ||
copy_field("$i.a","urn[].$append") | ||
end | ||
end | ||
|
||
# Sometimes urn are not set in 024 then we could pick up the missing from 856. | ||
# 856 - Electronic Location and Access (R) - Subfield: $u (R) $3 (NR) | ||
# 1. Indicator: 4 = HTTP | ||
# @urnLinks keeps the full resolver URLs; urn[] receives only the "urn:..." part cut out of the URL. | ||
set_array("@urnLinks") | ||
|
||
do list(path:"856??", "var":"$i") | ||
if all_match("$i.u", "^http.*(urn=|\\.(org|de)/)urn:.+$") # This should ignore repository links like: https://sammlungen.ulb.uni-muenster.de/urn/urn:nbn:de:hbz:6-85659520092 | ||
copy_field("$i.u", "urn[].$append") | ||
copy_field("$i.u", "@urnLinks.$append") | ||
replace_all("urn[].$last", "^http.*[/=](urn:.+$)", "$1") | ||
end | ||
end | ||
|
||
# Values that start with "nbn:de:" and end in a digit get the missing "urn:" prefix; duplicates are removed afterwards. | ||
replace_all("urn[].*","^(nbn:de:.*\\d)$","urn:$1") | ||
uniq("@urnLinks") | ||
uniq("urn[]") | ||
|
||
# 035 - System Control Number (R) - Subfield: $a (NR) | ||
# hbzId: the 035 $a value carrying the "(DE-605)" prefix followed by two non-digit characters. | ||
# NOTE(review): other field paths in this transformation are five characters wide (tag plus two indicator positions); confirm the spacing of "035 ". | ||
|
||
do list(path: "035 ", "var":"$i") | ||
if any_match("$i.a", "\\(DE-605\\)\\D\\D(.*)") | ||
copy_field("$i.a", "hbzId") | ||
end | ||
end | ||
|
||
# Strip the "(DE-605)" prefix and keep the remainder as the id. | ||
replace_all("hbzId","\\(DE-605\\)(.*)","$1") | ||
|
||
# add a deprecatedUri to all records with hbzId to document all old lobid urls. | ||
if exists("hbzId") | ||
paste("deprecatedUri", "~http://lobid.org/resources/", "hbzId", "~#!", join_char: "") | ||
end | ||
|
||
|
||
# 020 - International Standard Book Number (R) - $a (NR) | ||
# source data sometimes provides repeated subfield $a even if this is not valid marc | ||
# @isbn[] collects the raw $a values; isbn[] is the output array. | ||
# NOTE(review): other field paths in this transformation are five characters wide (tag plus two indicator positions); confirm the spacing of "020 ". | ||
|
||
set_array("@isbn[]") | ||
set_array("isbn[]") | ||
|
||
do list(path:"020 ", "var": "$i") | ||
do list(path:"$i.a", "var": "$j") | ||
copy_field("$j","@isbn[].$append") | ||
end | ||
end | ||
|
||
# Each ISBN is cleaned with isbn(to: "clean") and then appended twice on purpose: the first copy stays as it is, | ||
# the second copy (isbn[].$last) is converted to the other notation (13 -> 10 or 10 -> 13), so both forms are output. | ||
do list(path:"@isbn[]", "var": "$i") | ||
isbn("$i", to: "clean") | ||
copy_field("$i", "isbn[].$append") | ||
copy_field("$i", "isbn[].$append") | ||
if any_match("$i", ".{13}") | ||
isbn("isbn[].$last", to:"isbn10") | ||
elsif any_match("$i", ".{10}") | ||
isbn("isbn[].$last", to:"isbn13") | ||
end | ||
end | ||
|
||
# Remove duplicate values from the output array. | ||
uniq("isbn[]") | ||
|
||
|
||
# 022 - International Standard Serial Number (R) - Subfield $a (NR) | ||
# ISSNs are stored without hyphens and de-duplicated. | ||
set_array("issn[]") | ||
do list(path:"022? ", "var":"$i") | ||
copy_field("$i.a", "issn[].$append") | ||
end | ||
replace_all("issn[].*", "-","") | ||
uniq("issn[]") | ||
|
||
# 024 - Other Standard Identifier (R) - Subfield a (NR) 1. Indicator 2 = ISMN | ||
# ISMNs are stored without hyphens. | ||
# NOTE(review): unlike issn[], ismn[] is not passed through uniq() - confirm that is intended. | ||
set_array("ismn[]") | ||
do list(path:"0242?", "var":"$i") | ||
copy_field("$i.a", "ismn[].$append") | ||
end | ||
replace_all("ismn[].*", "-","") | ||
|
||
|
||
# 024 (R) Subfield a (NR) 1. Indicator 7 = identifier type named in $2 | ||
# doi: taken from 024 with first indicator 7 when $2 equals "doi". | ||
set_array("doi[]") | ||
do list(path:"0247?", "var":"$i") | ||
if all_equal("$i.2","doi") | ||
copy_field("$i.a", "doi[].$append") | ||
end | ||
end | ||
|
||
# Sometimes dois are not set in 024 then we could pick up the missing from 856. | ||
# 856 - Electronic Location and Access (R) - Subfield: $u (R) $3 (NR) | ||
# 1. Indicator: 4 = HTTP | ||
# The dot in "doi\\.org" is escaped so that only the real host name matches, not any character in its place. | ||
do list(path:"856??", "var":"$i") | ||
if all_match("$i.u", ".*doi\\.org.*(10\\.(\\d)+/(\\S)+).*") # Volltext | ||
copy_field("$i.u", "doi[].$append") | ||
end | ||
end | ||
# Reduce resolver URLs to the bare DOI ("10.<registrant>/<suffix>"); values that are not doi.org URLs do not match and stay unchanged. | ||
replace_all("doi[].*", ".*doi\\.org.*(10\\.(\\d)+/(\\S)+).*", "$1") | ||
uniq("doi[]") | ||
|
||
# 035 - System Control Number (R) - Subfield: $a (NR) | ||
# oclcNumber: every 035 $a carrying the "(OCoLC)" prefix; the prefix is stripped afterwards. | ||
set_array("oclcNumber[]") | ||
|
||
do list(path:"035 ", "var":"$i") | ||
if all_match("$i.a", "\\(OCoLC\\)(.*)") | ||
copy_field("$i.a", "oclcNumber[].$append") | ||
end | ||
end | ||
replace_all("oclcNumber[].*", "\\(OCoLC\\)","") | ||
|
||
#160 - 016 - National Bibliographic Agency Control Number (R) | ||
# With first indicator 7 the agency is named in $2: "DE-600" feeds zdbId (first hit only), "DE-101" feeds dnbId. | ||
do list(path:"0167 ", "var":"$i") | ||
unless exists("zdbId") | ||
if any_match("$i.2","DE-600") | ||
copy_field("$i.a","zdbId") | ||
end | ||
end | ||
|
||
# dnbId | ||
if any_match("$i.2","DE-101") | ||
copy_field("$i.a","dnbId") | ||
end | ||
end | ||
|
||
# 035 - System Control Number (R) - Subfield: $a (NR) | ||
# Fallback when 016 gave no zdbId: use a 035 $a prefixed with "(DE-600)" or "(DE-599)ZDB". | ||
do list(path:"035 ", "var":"$i") | ||
unless exists("zdbId") | ||
if all_match("$i.a", "\\(DE-600\\)(.*)") | ||
copy_field("$i.a", "zdbId") | ||
elsif all_match("$i.a", "\\(DE-599\\)(ZDB.*)") | ||
copy_field("$i.a", "zdbId") | ||
end | ||
end | ||
end | ||
|
||
# clean up ZDB: strip the prefixes, then normalise the separator in front of the final digit or X. | ||
replace_all("zdbId", "\\(DE-600\\)","") | ||
replace_all("zdbId", "\\(DE-599\\)ZDB","") | ||
replace_all("zdbId", "(\\d{1,7})-* ?-*([Xx\\d])","$1-$2") # CZ entries have incorrect whitespaces sometimes in the zdbId, we need to adjust them so only one "-" separates the first group of numbers from the last number. | ||
|
||
|
||
# rpbId: resolved from almaMmsId through the almaMmsId2rpbId map (defined outside this fragment); a leading "RPB" is stripped. | ||
# NOTE(review): delete:"true" presumably drops the field when the map has no entry - confirm. | ||
copy_field("almaMmsId","rpbId") | ||
lookup("rpbId","almaMmsId2rpbId",delete:"true") | ||
replace_all("rpbId", "^RPB","") | ||
|
||
# stockNumber: $a of every 028 field, regardless of indicators. | ||
set_array("stockNumber[]") | ||
do list(path:"028??", "var":"$i") | ||
copy_field("$i.a", "stockNumber[].$append") | ||
end | ||
|
||
# Fallback for records without hbzId that carry the @inNZ marker (set outside this fragment): | ||
# look up an old hbzId by zdbId in the zdbId2oldHbzId map and, if found, add deprecatedUri and hbzId from it. | ||
# NOTE(review): delete:"true" presumably removes @hbzId when the map has no entry, which the inner exists() check relies on - confirm. | ||
unless exists("hbzId") | ||
if exists("@inNZ") | ||
copy_field("zdbId","@hbzId") | ||
lookup("@hbzId","zdbId2oldHbzId",delete:"true") | ||
if exists("@hbzId") | ||
paste("deprecatedUri", "~http://lobid.org/resources/", "@hbzId", "~#!", join_char: "") | ||
end | ||
copy_field("@hbzId","hbzId") | ||
end | ||
end |
Oops, something went wrong.