Skip to content

Commit

Permalink
Add extract xml test records
Browse files Browse the repository at this point in the history
This extracts single records from a big XML file. It is used to easily
update the MARC XML test sources.

- install metafacture-core master SNAPSHOT locally
- use 'null' namespace in MarcXmlHandler
- update pom.xml
- add single xml tar file consisting of the test xml records
  • Loading branch information
dr0i committed Jan 5, 2021
1 parent 3761b40 commit 23c0045
Show file tree
Hide file tree
Showing 11 changed files with 108 additions and 41 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ dist: trusty
env:
- ACTIVATOR_VERSION=1.3.10
install:
- bash install-dependencies.sh
- mvn clean install -e -q --settings settings.xml
before_script:
- cd web
Expand Down
6 changes: 6 additions & 0 deletions install-dependencies.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Builds and installs metafacture-core from its default branch so that the
# SNAPSHOT dependency referenced by the project build can be resolved.
# NOTE(review): `./gradlew install` presumably publishes to the local Maven
# repository - confirm against the metafacture-core build.

# Abort on the first failing command, on unset variables and on failures
# inside pipelines, so a broken clone never leads to a build in the wrong
# directory.
set -euo pipefail

# `git clone` creates the target directory itself, so no separate mkdir is
# needed. Skip the clone when a checkout is already present (e.g. on a re-run).
if [ ! -d metafacture-core/.git ]; then
  git clone https://github.com/metafacture/metafacture-core.git
fi

# Run the build in a subshell so the caller's working directory is untouched.
(
  cd metafacture-core
  ./gradlew install
)
9 changes: 7 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,25 @@
<slf4j.version>1.7.25</slf4j.version>
</properties>
<dependencies>
<dependency>
<groupId>org.metafacture</groupId>
<artifactId>metafacture-strings</artifactId>
<version>5.1.0</version>
</dependency>
<dependency>
<groupId>org.metafacture</groupId>
<artifactId>metafacture-io</artifactId>
<version>5.1.0</version>
</dependency>
<dependency>
<dependency>
<groupId>org.metafacture</groupId>
<artifactId>metafacture-json</artifactId>
<version>5.1.0</version>
</dependency>
<dependency>
<groupId>org.metafacture</groupId>
<artifactId>metafacture-biblio</artifactId>
<version>5.1.0</version>
<version>master-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.metafacture</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,17 @@
import org.metafacture.io.FileOpener;
import org.metafacture.io.ObjectStdoutWriter;
import org.metafacture.io.ObjectWriter;
import org.metafacture.io.TarReader;
import org.metafacture.json.JsonEncoder;
import org.metafacture.mangling.LiteralToObject;
import org.metafacture.metamorph.Metamorph;
import org.metafacture.monitoring.StreamBatchLogger;
import org.metafacture.strings.StringFilter;
import org.metafacture.strings.StringReader;
import org.metafacture.xml.SimpleXmlEncoder;
import org.metafacture.xml.XmlDecoder;
import org.metafacture.xml.XmlElementSplitter;
import org.metafacture.xml.XmlFilenameWriter;

/**
* Test transformations of Alma MARC21 XML catalog data into lobid JSON-LD.
Expand All @@ -37,13 +45,17 @@ public final class AlmaMarc21XmlToLobidJsonTest {

private static final String MORPH = "src/main/resources/alma/alma.xml";
private static final File DIRECTORY = new File("src/test/resources/alma/");
private static final String BIG_ALMA_XML_FILE =
DIRECTORY + "/HT012734833_etAl.xml.tar.bz2";
private static final String XML = "xml";
final HashMap<String, String> morphVariables = new HashMap<>();
private static boolean GENERATE_TESTDATA =
private static boolean GENERATE_TESTDATA =
System.getProperty("generateTestData", "false").equals("true");
private static final PrintStream ORIG_OUT = System.out;
private static final Logger LOG =
LogManager.getLogger(AlmaMarc21XmlToLobidJsonTest.class);
private static final String PATTERN_TO_IDENTIFY_XML_RECORDS =
"HT005207972|HT012734833|KUR00770801";

/**
* Sets necessary morph variables.
Expand All @@ -53,11 +65,60 @@ public void setup() {
morphVariables.put("isil", "DE-632");
morphVariables.put("member", "DE-605");
morphVariables.put("catalogid", "DE-605");
GENERATE_TESTDATA = true;
if (GENERATE_TESTDATA) {
extractXmlTestRecords(PATTERN_TO_IDENTIFY_XML_RECORDS);
}
}

/**
 * Extracts single records from the big Alma XML archive
 * ({@code BIG_ALMA_XML_FILE}) and writes every record matching the given
 * pattern into the test directory. This helps to update the MARC XML test
 * files.
 *
 * The archive is opened, untarred, decoded as XML and split into
 * {@code record} elements. Each element is converted to a string and passed
 * through a {@link StringFilter} built from {@code pattern}. The strings that
 * pass the filter are decoded as XML again and handed to an
 * {@link XmlFilenameWriter}, which is configured with the XPath
 * {@code /record/datafield[@tag='035']/subfield[@code='a']} to determine the
 * file name and with {@code src/test/resources/alma/} as target.
 *
 * Original author's measurement: needs about 50 secs for 100.000 resources in
 * a 44 MB tar archive, which is about 100 times faster than Filter(morph).
 *
 * The files are not pretty printed but untouched, though.
 *
 * @param pattern the pattern which is searched for to identify xml records
 */
public static void extractXmlTestRecords(final String pattern) {
    final long startTime = System.currentTimeMillis();

    // Splits the decoded archive content into single "record" elements.
    final XmlElementSplitter recordSplitter = new XmlElementSplitter();
    recordSplitter.setElementName("record");
    // The filtered strings are decoded as XML a second time, so a second
    // splitter instance sits in front of the file writer.
    final XmlElementSplitter matchedRecordSplitter = new XmlElementSplitter();
    matchedRecordSplitter.setElementName("record");

    final StringFilter stringFilter = new StringFilter(pattern);

    final XmlFilenameWriter xmlFilenameWriter = new XmlFilenameWriter();
    xmlFilenameWriter
            .setProperty("/record/datafield[@tag='035']/subfield[@code='a']");
    xmlFilenameWriter.setTarget("src/test/resources/alma/");

    final StreamBatchLogger logger = new StreamBatchLogger();
    logger.setBatchSize(10);

    final FileOpener opener = new FileOpener();
    opener.setReceiver(new TarReader()) //
            .setReceiver(new XmlDecoder()) //
            .setReceiver(recordSplitter) //
            .setReceiver(logger) //
            .setReceiver(new LiteralToObject()) //
            .setReceiver(stringFilter) //
            .setReceiver(new StringReader()) //
            .setReceiver(new XmlDecoder()) //
            .setReceiver(matchedRecordSplitter) //
            .setReceiver(xmlFilenameWriter);

    opener.process(BIG_ALMA_XML_FILE);
    opener.closeStream();

    // Integer division: the elapsed time is logged in whole seconds.
    final long endTime = System.currentTimeMillis();
    LOG.info("Time needed:" + (endTime - startTime) / 1000);
}

/**
* Cleans a bit up. Sets the System.out to the original PrintStream.
*/
@SuppressWarnings("static-method")
@After
public void cleanup() {
System.setOut(ORIG_OUT);
Expand All @@ -74,6 +135,8 @@ public void cleanup() {
public void transformFiles() {
Arrays.asList(DIRECTORY.listFiles(f -> f.getAbsolutePath().endsWith(XML)))
.forEach(file -> {
MarcXmlHandler marcXmlHandler = new MarcXmlHandler();
marcXmlHandler.setNamespace(null);
JsonEncoder jsonEncoder = new JsonEncoder();
jsonEncoder.setPrettyPrinting(true);
ObjectMapper mapper = new ObjectMapper();
Expand All @@ -82,7 +145,7 @@ public void transformFiles() {
try {
FileOpener opener = new FileOpener();
opener.setReceiver(new XmlDecoder())
.setReceiver(new MarcXmlHandler())
.setReceiver(marcXmlHandler)
.setReceiver(new Metamorph(MORPH, morphVariables))
.setReceiver(jsonEncoder);

Expand All @@ -97,7 +160,8 @@ public void transformFiles() {
opener.process(file.getAbsolutePath());
opener.closeStream();
if (!GENERATE_TESTDATA) {
JsonNode expectedJsonNode = mapper.readTree(new File(filenameJson));
JsonNode expectedJsonNode =
mapper.readTree(new File(filenameJson));
Object expectedJsonObject =
mapper.readValue(expectedJsonNode.toString(), Object.class);
String expectedJson = mapper.writerWithDefaultPrettyPrinter()
Expand Down
2 changes: 1 addition & 1 deletion src/test/resources/alma/HT005207972.json
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@
},
"resultOf" : {
"type" : [ "CreateAction" ],
"endTime" : "2020-12-18T14:44:37",
"endTime" : "2021-01-05T14:18:03",
"instrument" : {
"id" : "https://github.com/hbz/lobid-resources",
"type" : [ "SoftwareApplication" ],
Expand Down
15 changes: 6 additions & 9 deletions src/test/resources/alma/HT005207972.xml
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim">
<record>
<leader>00000nam#a2200000#c#4500</leader>
<?xml version = "1.0" encoding = "UTF-8"?><record>
<leader>00000nam#a2200000#c#4500</leader>
<controlfield tag="003">DE-605</controlfield>
<controlfield tag="005">20130124165800.0</controlfield>
<controlfield tag="007">tu</controlfield>
Expand Down Expand Up @@ -63,7 +61,7 @@
<subfield code="a">XIV, 633 S.</subfield>
</datafield>
<datafield tag="689" ind1="0" ind2="0">
<subfield code="a">Arbeitsökonomie</subfield>
<subfield code="a">Arbeits&#246;konomie</subfield>
<subfield code="D">s</subfield>
<subfield code="0">(DE-588)4322126-9</subfield>
</datafield>
Expand Down Expand Up @@ -117,7 +115,7 @@
<datafield tag="MBD" ind1=" " ind2=" ">
<subfield code="M">49HBZ_BIE</subfield>
<subfield code="i">991020238039706442</subfield>
<subfield code="n">Universität Bielefeld</subfield>
<subfield code="n">Universit&#228;t Bielefeld</subfield>
</datafield>
<datafield tag="980" ind1="1" ind2=" ">
<subfield code="9">LOCAL</subfield>
Expand Down Expand Up @@ -198,6 +196,5 @@
<subfield code="c">SL800 M367(3)</subfield>
<subfield code="b">138_1155937+01</subfield>
<subfield code="M">49HBZ_BIE</subfield>
</datafield>
</record>
</collection>
</datafield>
</record>
2 changes: 1 addition & 1 deletion src/test/resources/alma/HT012734833.json
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@
},
"resultOf" : {
"type" : [ "CreateAction" ],
"endTime" : "2020-12-18T14:44:37",
"endTime" : "2021-01-05T14:18:03",
"instrument" : {
"id" : "https://github.com/hbz/lobid-resources",
"type" : [ "SoftwareApplication" ],
Expand Down
29 changes: 13 additions & 16 deletions src/test/resources/alma/HT012734833.xml
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim">
<record>
<leader>00000nms#a2200000#c#4500</leader>
<?xml version = "1.0" encoding = "UTF-8"?><record>
<leader>00000nms#a2200000#c#4500</leader>
<controlfield tag="003">DE-605</controlfield>
<controlfield tag="005">20191010224400.0</controlfield>
<controlfield tag="007">cr#|||||||||||</controlfield>
Expand Down Expand Up @@ -102,7 +100,7 @@
<subfield code="i">1989</subfield>
</datafield>
<datafield tag="591" ind1=" " ind2=" ">
<subfield code="a">C!URL-Ä(19-11-14)</subfield>
<subfield code="a">C!URL-&#196;(19-11-14)</subfield>
</datafield>
<datafield tag="776" ind1="0" ind2="8">
<subfield code="i">Druckausg.</subfield>
Expand All @@ -122,7 +120,7 @@
<datafield tag="856" ind1="4" ind2="0">
<subfield code="u">http://gateway.ovid.com/ovidweb.cgi?T=JS&amp;NEWS=N&amp;PAGE=toc&amp;SEARCH=00008877-000000000-00000.kc&amp;LINKTYPE=asBody&amp;LINKPOS=1&amp;D=ovft</subfield>
<subfield code="x">Verlag; 1.1989 - 15.2004</subfield>
<subfield code="z">Deutschlandweit zugänglich</subfield>
<subfield code="z">Deutschlandweit zug&#228;nglich</subfield>
</datafield>
<datafield tag="912" ind1=" " ind2=" ">
<subfield code="a">ZDB-1-LWW</subfield>
Expand Down Expand Up @@ -229,7 +227,7 @@
<subfield code="a">5333897520006444</subfield>
<subfield code="M">49HBZ_FHA</subfield>
<subfield code="c">static</subfield>
<subfield code="l">Deutschlandweit zugänglich</subfield>
<subfield code="l">Deutschlandweit zug&#228;nglich</subfield>
<subfield code="y">2020-07-23 14:31:47 Europe/Berlin</subfield>
<subfield code="w">2020-07-23 14:31:42 Europe/Berlin</subfield>
<subfield code="D">https://eu04.alma.exlibrisgroup.com/view/uresolver/49HBZ_NETWORK/openurl?u.ignore_date_coverage=true&amp;portfolio_pid=5333897520006444&amp;Force_direct=true</subfield>
Expand All @@ -247,7 +245,7 @@
<subfield code="a">5333897560006444</subfield>
<subfield code="M">49HBZ_FHA</subfield>
<subfield code="c">static</subfield>
<subfield code="l">Deutschlandweit zugänglich</subfield>
<subfield code="l">Deutschlandweit zug&#228;nglich</subfield>
<subfield code="y">2020-07-23 14:31:47 Europe/Berlin</subfield>
<subfield code="w">2020-07-23 14:31:42 Europe/Berlin</subfield>
<subfield code="D">https://eu04.alma.exlibrisgroup.com/view/uresolver/49HBZ_NETWORK/openurl?u.ignore_date_coverage=true&amp;portfolio_pid=5333897560006444&amp;Force_direct=true</subfield>
Expand All @@ -265,7 +263,7 @@
<subfield code="a">5333897540006444</subfield>
<subfield code="M">49HBZ_FHA</subfield>
<subfield code="c">static</subfield>
<subfield code="l">Deutschlandweit zugänglich</subfield>
<subfield code="l">Deutschlandweit zug&#228;nglich</subfield>
<subfield code="y">2020-07-23 14:31:47 Europe/Berlin</subfield>
<subfield code="w">2020-07-23 14:31:42 Europe/Berlin</subfield>
<subfield code="D">https://eu04.alma.exlibrisgroup.com/view/uresolver/49HBZ_NETWORK/openurl?u.ignore_date_coverage=true&amp;portfolio_pid=5333897540006444&amp;Force_direct=true</subfield>
Expand All @@ -283,7 +281,7 @@
<subfield code="a">5333897500006444</subfield>
<subfield code="M">49HBZ_FHA</subfield>
<subfield code="c">static</subfield>
<subfield code="l">Deutschlandweit zugänglich</subfield>
<subfield code="l">Deutschlandweit zug&#228;nglich</subfield>
<subfield code="y">2020-07-23 14:31:47 Europe/Berlin</subfield>
<subfield code="w">2020-07-23 14:31:42 Europe/Berlin</subfield>
<subfield code="D">https://eu04.alma.exlibrisgroup.com/view/uresolver/49HBZ_NETWORK/openurl?u.ignore_date_coverage=true&amp;portfolio_pid=5333897500006444&amp;Force_direct=true</subfield>
Expand All @@ -301,7 +299,7 @@
<subfield code="a">5375175020006445</subfield>
<subfield code="M">49HBZ_UBD</subfield>
<subfield code="c">static</subfield>
<subfield code="l">Deutschlandweit zugänglich</subfield>
<subfield code="l">Deutschlandweit zug&#228;nglich</subfield>
<subfield code="y">2020-07-23 13:09:58 Europe/Berlin</subfield>
<subfield code="w">2020-07-23 13:09:49 Europe/Berlin</subfield>
<subfield code="D">https://eu04.alma.exlibrisgroup.com/view/uresolver/49HBZ_NETWORK/openurl?u.ignore_date_coverage=true&amp;portfolio_pid=5375175020006445&amp;Force_direct=true</subfield>
Expand All @@ -319,7 +317,7 @@
<subfield code="a">5375174980006445</subfield>
<subfield code="M">49HBZ_UBD</subfield>
<subfield code="c">static</subfield>
<subfield code="l">Deutschlandweit zugänglich</subfield>
<subfield code="l">Deutschlandweit zug&#228;nglich</subfield>
<subfield code="y">2020-07-23 13:09:58 Europe/Berlin</subfield>
<subfield code="w">2020-07-23 13:09:49 Europe/Berlin</subfield>
<subfield code="D">https://eu04.alma.exlibrisgroup.com/view/uresolver/49HBZ_NETWORK/openurl?u.ignore_date_coverage=true&amp;portfolio_pid=5375174980006445&amp;Force_direct=true</subfield>
Expand All @@ -337,7 +335,7 @@
<subfield code="a">5375175000006445</subfield>
<subfield code="M">49HBZ_UBD</subfield>
<subfield code="c">static</subfield>
<subfield code="l">Deutschlandweit zugänglich</subfield>
<subfield code="l">Deutschlandweit zug&#228;nglich</subfield>
<subfield code="y">2020-07-23 13:09:58 Europe/Berlin</subfield>
<subfield code="w">2020-07-23 13:09:49 Europe/Berlin</subfield>
<subfield code="D">https://eu04.alma.exlibrisgroup.com/view/uresolver/49HBZ_NETWORK/openurl?u.ignore_date_coverage=true&amp;portfolio_pid=5375175000006445&amp;Force_direct=true</subfield>
Expand All @@ -355,7 +353,7 @@
<subfield code="a">5375174960006445</subfield>
<subfield code="M">49HBZ_UBD</subfield>
<subfield code="c">static</subfield>
<subfield code="l">Deutschlandweit zugänglich</subfield>
<subfield code="l">Deutschlandweit zug&#228;nglich</subfield>
<subfield code="y">2020-07-23 13:09:58 Europe/Berlin</subfield>
<subfield code="w">2020-07-23 13:09:49 Europe/Berlin</subfield>
<subfield code="D">https://eu04.alma.exlibrisgroup.com/view/uresolver/49HBZ_NETWORK/openurl?u.ignore_date_coverage=true&amp;portfolio_pid=5375174960006445&amp;Force_direct=true</subfield>
Expand All @@ -368,5 +366,4 @@
<subfield code="f">BOOK</subfield>
<subfield code="8">5375174960006445</subfield>
</datafield>
</record>
</collection>
</record>
Binary file not shown.
2 changes: 1 addition & 1 deletion src/test/resources/alma/KUR00770801.json
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@
},
"resultOf" : {
"type" : [ "CreateAction" ],
"endTime" : "2020-12-18T14:44:37",
"endTime" : "2021-01-05T14:18:02",
"instrument" : {
"id" : "https://github.com/hbz/lobid-resources",
"type" : [ "SoftwareApplication" ],
Expand Down
13 changes: 5 additions & 8 deletions src/test/resources/alma/KUR00770801.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim">
<record>
<?xml version = "1.0" encoding = "UTF-8"?><record>
<leader>00000nam#a2200000#c#4500</leader>
<controlfield tag="005">20200518164500.0</controlfield>
<controlfield tag="007">cu#|||||||||||</controlfield>
Expand Down Expand Up @@ -34,7 +32,7 @@
</datafield>
<datafield tag="264" ind1=" " ind2="1">
<subfield code="a">Sebastopol, Calif.</subfield>
<subfield code="b">O'Reilly Media</subfield>
<subfield code="b">O&apos;Reilly Media</subfield>
<subfield code="c">c2010</subfield>
</datafield>
<datafield tag="300" ind1=" " ind2=" ">
Expand Down Expand Up @@ -72,7 +70,7 @@
<datafield tag="856" ind1="4" ind2=" ">
<subfield code="z">Connect to this resource online</subfield>
<subfield code="u">http://proquest.tech.safaribooksonline.de/?uiCode=Duesseldorf&amp;xmlId=9780735656260</subfield>
<subfield code="z">Zugriff nur im Hochschulnetz der Universität Düsseldorf</subfield>
<subfield code="z">Zugriff nur im Hochschulnetz der Universit&#228;t D&#252;sseldorf</subfield>
</datafield>
<datafield tag="980" ind1="1" ind2=" ">
<subfield code="e">safari-2019</subfield>
Expand All @@ -81,7 +79,7 @@
<datafield tag="MBD" ind1=" " ind2=" ">
<subfield code="M">49HBZ_DUE</subfield>
<subfield code="i">990042506810206443</subfield>
<subfield code="n">Universität Düsseldorf</subfield>
<subfield code="n">Universit&#228;t D&#252;sseldorf</subfield>
</datafield>
<datafield tag="MNG" ind1=" " ind2=" ">
<subfield code="f">ILS</subfield>
Expand All @@ -93,5 +91,4 @@
<subfield code="a">import</subfield>
<subfield code="b">2020-07-01 12:09:43 Europe/Berlin</subfield>
</datafield>
</record>
</collection>
</record>

0 comments on commit 23c0045

Please sign in to comment.