Skip to content

Commit

Permalink
Reuse lobid-transformation for hebis data RPD-225
Browse files Browse the repository at this point in the history
I reuse the whole transformation; so far no adjustments have been made for the specific hebis data source, e.g. almaMmsId should be renamed to hebisId.
  • Loading branch information
TobiasNx committed Dec 2, 2024
1 parent 3153738 commit 39d4908
Show file tree
Hide file tree
Showing 38 changed files with 103,739 additions and 1 deletion.
3 changes: 2 additions & 1 deletion conf/fetchAndTransformHebisRecord.flux
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
// Fetches one record from the hebis SRU endpoint as MARC21-XML, runs it through
// the lobid transformation and prints the result as pretty-printed JSON.
// The query below is pinned to a single record (pica.ppn = 524204101).
SRUQUERRY = "http://sru.hebis.de/sru/DB=2.1?query=pica.ort+%3D+%22Mainz%22+and+pica.ppn+%3D+%22524204101%22&version=1.1&operation=searchRetrieve&stylesheet=http%3A%2F%2Fsru.hebis.de%2Fsru%2F%3Fxsl%3DsearchRetrieveResponse&recordSchema=marc21&maximumRecords=10&startRecord=1&recordPacking=xml&sortKeys=LST_Y%2Cpica%2C0%2C%2C";
// Read by conf/lobid-transformation/fix/describedBy.fix as $[createEndTime]:
// a value matching "0" writes a fixed placeholder endTime, anything else the current time.
createEndTime = "1";

SRUQUERRY
| open-http(accept="application/xml")
| decode-xml
| handle-marcxml
// NOTE(review): no-op fix stage; looks like a leftover from before the lobid
// transformation was wired in - confirm whether it can be dropped.
| fix("nothing()")
// The trailing * presumably hands all Flux variables (e.g. createEndTime) to the fix - confirm.
| fix(FLUX_DIR + "lobid-transformation/marcToLobid.fix",*)
| encode-json(prettyPrinting="true")
| print
;
388 changes: 388 additions & 0 deletions conf/lobid-transformation/fix/contribution.fix

Large diffs are not rendered by default.

107 changes: 107 additions & 0 deletions conf/lobid-transformation/fix/describedBy.fix
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@

# Builds the static part of "describedBy": the description's own id/label/type,
# the dataset it belongs to and the CreateAction ("resultOf") that produced it.
# almaMmsId is copied from MARC 001 in identifiers.fix.
# NOTE(review): URLs and labels below are hbz/lobid-specific; per the commit
# message they have not yet been adapted for hebis data.
copy_field("almaMmsId", "describedBy.id")
prepend("describedBy.id", "http://lobid.org/resources/")

copy_field("almaMmsId", "describedBy.label")
prepend("describedBy.label", "Webseite der hbz-Ressource ")

set_array("describedBy.type[]", "BibliographicDescription")


add_field("describedBy.inDataset.id","http://lobid.org/resources/dataset#!")

add_field("describedBy.inDataset.label","lobid-resources – Der hbz-Verbundkatalog als Linked Open Data")

set_array("describedBy.resultOf.type[]", "CreateAction")

# $[createEndTime] is a variable supplied from outside the fix (the Flux in this commit sets it to "1").
# A value matching "0" yields a fixed placeholder endTime (presumably for reproducible
# test output - confirm); otherwise a timestamp for Europe/Berlin is written.
add_field("@createTime","$[createEndTime]")
if all_match("@createTime","0")
add_field("describedBy.resultOf.endTime","0000-00-00T00:00:00")
else
timestamp("describedBy.resultOf.endTime",format:"yyyy-MM-dd'T'HH:mm:ss", timezone:"Europe/Berlin")
end


# The software that performed the CreateAction.
add_field("describedBy.resultOf.instrument.id","https://github.com/hbz/lobid-resources")

set_array("describedBy.resultOf.instrument.type[]", "SoftwareApplication")

add_field("describedBy.resultOf.instrument.label","Software lobid-resources")

# Link to the MARC-XML source record this description was created from.
copy_field("almaMmsId","describedBy.resultOf.object.id")
prepend("describedBy.resultOf.object.id","https://lobid.org/marcxml/")

# MNG is an ALMA-specific element; $b is used as creation date, $d as modification date.
# Field paths are tag + indicator 1 + indicator 2 (5 characters), hence the
# two blanks after "MNG" for the two empty indicators.
copy_field("MNG  .b","describedBy.resultOf.object.dateCreated")
copy_field("MNG  .d","describedBy.resultOf.object.dateModified")

# Normalise both values to plain digits:
#  1. drop hyphens,
#  2. cut everything from the first blank onwards (e.g. a time part),
#  3. strip "c", "©", whitespace and punctuation.
# Step 3 uses a character class on purpose: in the former alternation
# "c|©|\\s?|,|.|:|;|/|=" the "\\s?" branch matches the empty string at every
# position, so the punctuation branches behind it were never tried.
replace_all("describedBy.resultOf.object.dateCreated","-","")
replace_all("describedBy.resultOf.object.dateCreated"," .*","")
replace_all("describedBy.resultOf.object.dateCreated","[c©\\s,.:;/=]","")
replace_all("describedBy.resultOf.object.dateModified","-","")
replace_all("describedBy.resultOf.object.dateModified"," .*","")
replace_all("describedBy.resultOf.object.dateModified","[c©\\s,.:;/=]","")

# Discard values that contain neither an 8-digit date nor a 4-digit year.
unless any_match("describedBy.resultOf.object.dateCreated","\\d{8}|\\d{4}")
  remove_field("describedBy.resultOf.object.dateCreated")
end
unless any_match("describedBy.resultOf.object.dateModified","\\d{8}|\\d{4}")
  remove_field("describedBy.resultOf.object.dateModified")
end

# yyyyMMdd -> yyyy-MM-dd; a bare year yyyy -> yyyy-01-01.
replace_all("describedBy.resultOf.object.dateCreated","^(\\d{4})(\\d{2})(\\d{2})$","$1-$2-$3")
replace_all("describedBy.resultOf.object.dateModified","^(\\d{4})(\\d{2})(\\d{2})$","$1-$2-$3")
replace_all("describedBy.resultOf.object.dateCreated","^(\\d{4})$","$1-01-01")
replace_all("describedBy.resultOf.object.dateModified","^(\\d{4})$","$1-01-01")

# Static description of the source record ("object" of the CreateAction):
# its type, a human-readable label built from almaMmsId, and the dataset it comes from.
# NOTE(review): labels and dataset are hbz-specific; not yet adapted for hebis.
set_array("describedBy.resultOf.object.type[]", "DataFeedItem")

copy_field("almaMmsId","describedBy.resultOf.object.label")
prepend("describedBy.resultOf.object.label","hbz-Ressource ")
append("describedBy.resultOf.object.label"," im Exportformat MARC21 XML")

add_field("describedBy.resultOf.object.inDataset.id", "https://datahub.io/dataset/hbz_unioncatalog")

add_field("describedBy.resultOf.object.inDataset.label", "hbz_unioncatalog")

# Licence of the description: a single CC0 entry; $append creates the entry,
# $last adds the label to that same entry.
set_array("describedBy.license[]")
add_field("describedBy.license[].$append.id","http://creativecommons.org/publicdomain/zero/1.0" )
add_field("describedBy.license[].$last.label","Creative Commons-Lizenz CC0 1.0 Universal" )


# TODO: It seems that there are a lot of organisations that are not in lobid, we should filter them out.

# 040 - Cataloging Source (NR) - Subfield: $a (NR), $c (NR), $d (R)
# ALMA has a lot of invalid repeated subfields $a
#
# Collects provenance from 040:
#   $a -> sourceOrganization.id (first value wins)
#   $c -> provider.id           (first value wins)
#   $d -> modifiedBy[].id       (all values)
# Field paths are tag + indicator 1 + indicator 2 (5 characters), hence the
# two blanks after "040" for the two empty indicators.

do list(path: "040  ", "var":"$i")

  do list(path:"$i.a","var":"$j")
    unless exists("describedBy.resultOf.object.sourceOrganization.id")
      copy_field("$j", "describedBy.resultOf.object.sourceOrganization.id")
    end
  end
  do list(path:"$i.c","var":"$j")
    unless exists("describedBy.resultOf.object.provider.id")
      copy_field("$j", "describedBy.resultOf.object.provider.id")
    end
  end

  # Create the array only once: an (invalid) repeated 040 must not reset the
  # entries collected from an earlier occurrence.
  unless exists("describedBy.resultOf.object.modifiedBy[]")
    set_array("describedBy.resultOf.object.modifiedBy[]")
  end
  do list(path:"$i.d", "var":"$j")
    copy_field("$j", "describedBy.resultOf.object.modifiedBy[].$append.id")
  end

end

# Post-processes the organisation ids collected from 040:
# the "provenanceLinks" macro (defined outside this file) is applied to each id,
# then the id is copied to a label and looked up in the "lobidOrgLabels" map.
# delete:"true" drops labels for which the map has no entry.
call_macro("provenanceLinks",field: "describedBy.resultOf.object.sourceOrganization.id")
copy_field("describedBy.resultOf.object.sourceOrganization.id","describedBy.resultOf.object.sourceOrganization.label")
lookup("describedBy.resultOf.object.sourceOrganization.label","lobidOrgLabels",delete:"true")
call_macro("provenanceLinks",field: "describedBy.resultOf.object.provider.id")
copy_field("describedBy.resultOf.object.provider.id","describedBy.resultOf.object.provider.label")
lookup("describedBy.resultOf.object.provider.label","lobidOrgLabels",delete:"true")
# Same treatment for every modifiedBy entry; the label lookup runs once over all entries afterwards.
do list(path:"describedBy.resultOf.object.modifiedBy[]","var":"$i")
call_macro("provenanceLinks",field: "$i.id")
copy_field("$i.id","$i.label")
end
lookup("describedBy.resultOf.object.modifiedBy[].*.label","lobidOrgLabels",delete:"true")

# Remove duplicate modifiedBy entries.
uniq("describedBy.resultOf.object.modifiedBy[]")
170 changes: 170 additions & 0 deletions conf/lobid-transformation/fix/identifiers.fix
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
# Record identifier: MARC 001 is kept as almaMmsId and used to build the lobid resource URI.
# NOTE(review): per the commit message, almaMmsId should eventually be renamed for hebis data.
copy_field("001","almaMmsId")

# "~" marks a literal string in paste; the result is http://lobid.org/resources/<001>#!
paste("id", "~http://lobid.org/resources/", "001", "~#!", join_char: "")


# 024 - Other Standard Identifier (R) Subfield: $a (NR) $2 (NR)
# urn

set_array("urn[]")

# 024 with first indicator 7: take $a when $2 says the identifier is a urn.
do list(path: "0247?", "var": "$i")
if any_equal("$i.2","urn")
copy_field("$i.a","urn[].$append")
end
end

# Sometimes urn are not set in 024 then we could pick up the missing from 856.
# 856 - Electronic Location and Access (R) - Subfield: $u (R) $3 (NR)
# 1. Indicator: 4 = HTTP
# @urnLinks keeps the complete resolver URLs; urn[] only gets the urn part cut out of the URL.
set_array("@urnLinks")

do list(path:"856??", "var":"$i")
if all_match("$i.u", "^http.*(urn=|\\.(org|de)/)urn:.+$") # This should ignore repository links like: https://sammlungen.ulb.uni-muenster.de/urn/urn:nbn:de:hbz:6-85659520092
copy_field("$i.u", "urn[].$append")
copy_field("$i.u", "@urnLinks.$append")
replace_all("urn[].$last", "^http.*[/=](urn:.+$)", "$1")
end
end

# Add the missing "urn:" prefix to values that start directly with "nbn:de:", then de-duplicate.
replace_all("urn[].*","^(nbn:de:.*\\d)$","urn:$1")
uniq("@urnLinks")
uniq("urn[]")

# 035 - System Control Number (R) - Subfield: $a (NR)
# hbzId: the 035 $a value prefixed with (DE-605) whose id starts with two non-digits.
# Field paths are tag + indicator 1 + indicator 2 (5 characters), hence the
# two blanks after "035" for the two empty indicators.

do list(path: "035  ", "var":"$i")
  if any_match("$i.a", "\\(DE-605\\)\\D\\D(.*)")
    copy_field("$i.a", "hbzId")
  end
end

# Strip the "(DE-605)" prefix, keeping only the id itself.
replace_all("hbzId","\\(DE-605\\)(.*)","$1")

# add a deprecatedUri to all records with hbzId to document all old lobid urls.
if exists("hbzId")
  paste("deprecatedUri", "~http://lobid.org/resources/", "hbzId", "~#!", join_char: "")
end


# 020 - International Standard Book Number (R) - $a (NR)
# source data sometimes provides repeated subfield $a even if this is not valid marc
#
# Every ISBN is emitted in both notations (ISBN-10 and ISBN-13).
# Field paths are tag + indicator 1 + indicator 2 (5 characters), hence the
# two blanks after "020" for the two empty indicators.

set_array("@isbn[]")
set_array("isbn[]")

# Step 1: collect all raw $a values in the helper array @isbn[].
do list(path:"020  ", "var": "$i")
  do list(path:"$i.a", "var": "$j")
    copy_field("$j","@isbn[].$append")
  end
end

# Step 2: clean each value and append it twice; the second copy is then
# converted to the other notation, depending on the length of the cleaned value.
do list(path:"@isbn[]", "var": "$i")
  isbn("$i", to: "clean")
  copy_field("$i", "isbn[].$append")
  copy_field("$i", "isbn[].$append")
  if any_match("$i", ".{13}")
    isbn("isbn[].$last", to:"isbn10")
  elsif any_match("$i", ".{10}")
    isbn("isbn[].$last", to:"isbn13")
  end
end

uniq("isbn[]")


# 022 - International Standard Serial Number (R) - Subfield $a (NR)
# ISSNs are stored without hyphens and de-duplicated.
set_array("issn[]")
do list(path:"022? ", "var":"$i")
copy_field("$i.a", "issn[].$append")
end
replace_all("issn[].*", "-","")
uniq("issn[]")

# 024 - Other Standard Identifier (R) - Subfield a (NR) 1. Indicator 2 = ISMN
# ISMNs are stored without hyphens; unlike issn[] and doi[] this array is not de-duplicated here.
set_array("ismn[]")
do list(path:"0242?", "var":"$i")
copy_field("$i.a", "ismn[].$append")
end
replace_all("ismn[].*", "-","")


# 024 (R) Subfield a (NR) 1. Indicator 7 = to defined Identifier
# Take $a when $2 names the identifier type "doi".
set_array("doi[]")
do list(path:"0247?", "var":"$i")
if all_equal("$i.2","doi")
copy_field("$i.a", "doi[].$append")
end
end

# Sometimes dois are not set in 024 then we could pick up the missing from 856.
# 856 - Electronic Location and Access (R) - Subfield: $u (R) $3 (NR)
# 1. Indicator: 4 = HTTP
do list(path:"856??", "var":"$i")
if all_match("$i.u", ".*doi.org.*(10\\.(\\d)+/(\\S)+).*") # Volltext
copy_field("$i.u", "doi[].$append")
end
end
# Reduce doi.org URLs to the bare DOI (10.<digits>/<suffix>) and de-duplicate.
replace_all("doi[].*", ".*doi.org.*(10\\.(\\d)+/(\\S)+).*", "$1")
uniq("doi[]")

# 035 - System Control Number (R) - Subfield: $a (NR)
# oclcNumber: all 035 $a values carrying the "(OCoLC)" prefix, stored without that prefix.
# Field paths are tag + indicator 1 + indicator 2 (5 characters), hence the
# two blanks after "035" for the two empty indicators.
set_array("oclcNumber[]")

do list(path:"035  ", "var":"$i")
  if all_match("$i.a", "\\(OCoLC\\)(.*)")
    copy_field("$i.a", "oclcNumber[].$append")
  end
end
replace_all("oclcNumber[].*", "\\(OCoLC\\)","")

# 016 - National Bibliographic Agency Control Number (R)
# 1. Indicator 7: the agency is named in $2 (DE-600 -> zdbId, DE-101 -> dnbId).
do list(path:"0167 ", "var":"$i")
  # zdbId: only the first matching 016 is used.
  unless exists("zdbId")
    if any_match("$i.2","DE-600")
      copy_field("$i.a","zdbId")
    end
  end

  # dnbId
  if any_match("$i.2","DE-101")
    copy_field("$i.a","dnbId")
  end
end

# 035 - System Control Number (R) - Subfield: $a (NR)
# Fallback when 016 provided no zdbId: take it from a "(DE-600)" or "(DE-599)ZDB" prefixed 035 $a.
# Field paths are tag + indicator 1 + indicator 2 (5 characters), hence the
# two blanks after "035" for the two empty indicators.
do list(path:"035  ", "var":"$i")
  unless exists("zdbId")
    if all_match("$i.a", "\\(DE-600\\)(.*)")
      copy_field("$i.a", "zdbId")
    elsif all_match("$i.a", "\\(DE-599\\)(ZDB.*)")
      copy_field("$i.a", "zdbId")
    end
  end
end

# clean up ZDB
replace_all("zdbId", "\\(DE-600\\)","")
replace_all("zdbId", "\\(DE-599\\)ZDB","")
replace_all("zdbId", "(\\d{1,7})-* ?-*([Xx\\d])","$1-$2") # CZ entries have incorrect whitespaces sometimes in the zdbId, we need to adjust them so only one "-" separates the first group of numbers from the last number.


# rpbId: almaMmsId translated via the "almaMmsId2rpbId" map (defined outside this file);
# delete:"true" drops the field when the map has no entry. A leading "RPB" is stripped.
copy_field("almaMmsId","rpbId")
lookup("rpbId","almaMmsId2rpbId",delete:"true")
replace_all("rpbId", "^RPB","")

# stockNumber: $a of every 028 field, regardless of indicators.
set_array("stockNumber[]")
do list(path:"028??", "var":"$i")
copy_field("$i.a", "stockNumber[].$append")
end

# Fallback for records without an hbzId from 035: if @inNZ is set (set outside this
# file; presumably marks network-zone records - confirm), map the zdbId to the old
# hbzId via "zdbId2oldHbzId" and, when found, also emit the deprecatedUri.
unless exists("hbzId")
if exists("@inNZ")
copy_field("zdbId","@hbzId")
lookup("@hbzId","zdbId2oldHbzId",delete:"true")
if exists("@hbzId")
paste("deprecatedUri", "~http://lobid.org/resources/", "@hbzId", "~#!", join_char: "")
end
copy_field("@hbzId","hbzId")
end
end
Loading

0 comments on commit 39d4908

Please sign in to comment.