From 06c5e1afc264b13e7275859bf70a936b57ba0f9b Mon Sep 17 00:00:00 2001 From: pascal Christoph Date: Mon, 26 May 2014 15:47:08 +0200 Subject: [PATCH 1/3] Add Mab Xml handler * add a usage example as flux test --- examples/read/mab2/HT010726584.xml.bz2 | Bin 0 -> 2024 bytes examples/read/mab2/mabXml.flux | 9 ++ .../stream/converter/xml/MabXmlHandler.java | 83 ++++++++++++++++++ src/main/resources/flux-commands.properties | 1 + .../converter/xml/MabXmlHandlerTest.java | 24 +++++ 5 files changed, 117 insertions(+) create mode 100644 examples/read/mab2/HT010726584.xml.bz2 create mode 100644 examples/read/mab2/mabXml.flux create mode 100644 src/main/java/org/culturegraph/mf/stream/converter/xml/MabXmlHandler.java create mode 100644 src/test/java/org/culturegraph/mf/stream/converter/xml/MabXmlHandlerTest.java diff --git a/examples/read/mab2/HT010726584.xml.bz2 b/examples/read/mab2/HT010726584.xml.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..fbb68ad3c8bd4938f74912a90c426b63a97d7664 GIT binary patch literal 2024 zcmV$zXTWBkdfAP2K~`n?9hOq7x}M6Oj+_@()Ed}Tso2pez-4AMvj)bjbW=>s@hvTVjE zN!_+j)ewfNQ%WvaL`Qu9*7y6G-cBrKW!-`dgpeggNNGz?xPjY20{;&$N%lZK!Uej!g$ut-J?2>yCZL9~0%!z`2s03R zm{bN74kNNTFXh)WM}Dc+A{%7*q@Fr3(KM3#{9D+<*0qkc`mL}m;awZ>z=nd5K}wVI z)j?qx)lxA>Ixwm^Jq;U|D=o&`rp5;DojNigO>w$09Vtbv`Eh$33*?ta1ohuV*=o88mq@xJ&VX)_@$*z!8XT3AFs!C(B3S^vz@SRC zoDfDiOL#zygz7qQB{s;yFSLXp1~gf(Wi%Pi0uTTv>86;tn77qK7Ex0ZAlEKU?8`zq zRaVYfC~2*e*`-ZS6C7`a;R4#BTA=}&sEI{HwK{0Dpn!k~mwP7%rQcs|47*EvUe>wB zY!aflP3)S=Kur+O>!{aBN~xUfdbMFqm{~IGE!bKJK~Gh+1e`(;1mM>R1hh>B2Q1+u zYQ(^d2os1g5VW2aA`>jG*e;x@7_6m81|oJ%lttG-O1OfSpc8K?I!eMPebloBLM*yr zJMmK<6ADFQSynnoXHjn_Xrbu+e8=bJYCtld5AkoiPseSDpaNo1japhQuVTXJcEcAV zziJVhmGKlwr#hoJE6`J#=B5caJuSCS0SotVs!xgMuLSZ$QV@yiBqV|my114TK)4MQ z%33IS1s1Jh=D*IlDXipr#UmUOAjWV&LB_kU74Jiz%wp({$$Xpro}R2e5#7(_JRPb6 zo^zc5>p&nu{81SQhgg9nMj0Rw$XEzLniLX21yVp~mZ}7ZI>-Up4i-rYTqE5RjK-klJ|=1Gh22K$~7Z|Y##G%)nmHBO= z3oi_r%99yodyA4-*!dx}oBS2Oj>er(@ytu$@yjA@$7eqVoRuEfOyF&eQ%qWIBPWt~ zde;#O1`vvF8iOB?Fxe9@VeWa2IftqK)q0LcRskx$bs z@!IhUMWyn4qh|dM3#wgF?d!ydJ==<)@dN;%b`z3<6r1j}$alUlTjctn9Ca9FaQ;WM z$(QtydMi&ViU%oq>SC#8tKak3n5m@rih%mi4Wb6b0$~CKSLqNZsxF4!0F`BYkOj5c zs0>c-0-dWRv~ymDr9fvZnW!N=a{)l~)F2rv1p*;K2q8t8a3~i7flsH;7=fWs8;e66 z%-awOt}*bGBPgN8KsAUGVNf07qfRWG1_9Dojy2`5C%h=8PPo3=O$=@XL#C2zofjEHCgo{uPWJ3N~!bguyGl&^5 z*q|FELAI4ZL>c{v1&9XrdT)v-oVVaZb`K8@{2-uA4-F43mRgoYf(Fpumx!j7191e6 z)#xZ7am}f>) GZ+{?U6pW4l literal 0 HcmV?d00001 diff --git a/examples/read/mab2/mabXml.flux b/examples/read/mab2/mabXml.flux new file mode 100644 index 000000000..e37fc15d5 --- /dev/null +++ b/examples/read/mab2/mabXml.flux @@ -0,0 +1,9 @@ +default files = FLUX_DIR; + +files+"HT010726584.xml.bz2"| +open-file(compression="BZIP2") | +decode-xml | +handle-mabxml | +encode-formeta(style="multiline")| +write("stdout"); +}; diff --git a/src/main/java/org/culturegraph/mf/stream/converter/xml/MabXmlHandler.java b/src/main/java/org/culturegraph/mf/stream/converter/xml/MabXmlHandler.java new file mode 100644 index 000000000..4165660d1 --- /dev/null +++ b/src/main/java/org/culturegraph/mf/stream/converter/xml/MabXmlHandler.java @@ -0,0 +1,83 @@ +/** Copyright 2013,214 hbz, Pascal Christoph. + * Licensed under the Eclipse Public License 1.0 + **/ + +package org.culturegraph.mf.stream.converter.xml; + +import org.culturegraph.mf.framework.DefaultXmlPipe; +import org.culturegraph.mf.framework.StreamReceiver; +import org.culturegraph.mf.framework.XmlReceiver; +import org.culturegraph.mf.framework.annotations.Description; +import org.culturegraph.mf.framework.annotations.In; +import org.culturegraph.mf.framework.annotations.Out; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; + +/** + * A MAB XML reader. + * + * @author Pascal Christoph (dr0i) + * + */ +@Description("A MAB XML reader") +@In(XmlReceiver.class) +@Out(StreamReceiver.class) +public final class MabXmlHandler extends DefaultXmlPipe { + + private static final String SUBFIELD = "subfield"; + private static final String DATAFIELD = "datafield"; + private static final String CONTROLLFIELD = "controlfield"; + private static final String RECORD = "ListRecords"; + private static final String LEADER = "leader"; + private static final String DATAFIELD_ATTRIBUTE = "tag"; + private static final String SUBFIELD_ATTRIBUTE = "code"; + private static final String INDICATOR1 = "ind1"; + private static final String INDICATOR2 = "ind2"; + private String currentTag = ""; + private StringBuilder builder = new StringBuilder(); + + @Override + public void characters(final char[] chars, final int start, final int length) + throws SAXException { + this.builder.append(chars, start, length); + } + + @Override + public void endElement(final String uri, final String localName, final String qName) + throws SAXException { + if (MabXmlHandler.CONTROLLFIELD.equals(localName)) { + getReceiver().literal(this.currentTag, this.builder.toString().trim()); + getReceiver().endEntity(); + } else if (MabXmlHandler.SUBFIELD.equals(localName)) { + getReceiver().literal(this.currentTag, this.builder.toString().trim()); + } else if (MabXmlHandler.DATAFIELD.equals(localName)) { + getReceiver().endEntity(); + } else if (MabXmlHandler.RECORD.equals(localName)) { + getReceiver().endRecord(); + } + } + + @Override + public void startElement(final String uri, final String localName, final String qName, + final Attributes attributes) throws SAXException { + if (MabXmlHandler.CONTROLLFIELD.equals(localName)) { + this.builder = new StringBuilder(); + this.currentTag = ""; + getReceiver().startEntity(attributes.getValue(MabXmlHandler.DATAFIELD_ATTRIBUTE)); + } else if (MabXmlHandler.SUBFIELD.equals(localName)) { + this.builder = new StringBuilder(); + this.currentTag = attributes.getValue(MabXmlHandler.SUBFIELD_ATTRIBUTE); + } else if (MabXmlHandler.DATAFIELD.equals(localName)) { + getReceiver().startEntity( + attributes.getValue(MabXmlHandler.DATAFIELD_ATTRIBUTE) + + attributes.getValue(MabXmlHandler.INDICATOR1) + + attributes.getValue(MabXmlHandler.INDICATOR2)); + } else if (MabXmlHandler.RECORD.equals(localName)) { + getReceiver().startRecord(""); + } else if (MabXmlHandler.LEADER.equals(localName)) { + this.builder = new StringBuilder(); + this.currentTag = MabXmlHandler.LEADER; + } + } + +} diff --git a/src/main/resources/flux-commands.properties b/src/main/resources/flux-commands.properties index c37aedba3..473d180d9 100644 --- a/src/main/resources/flux-commands.properties +++ b/src/main/resources/flux-commands.properties @@ -41,6 +41,7 @@ read-beacon org.culturegraph.mf.stream.reader.BeaconReader handle-cg-xml org.culturegraph.mf.stream.converter.xml.CGXmlHandler handle-generic-xml org.culturegraph.mf.stream.converter.xml.GenericXmlHandler handle-marcxml org.culturegraph.mf.stream.converter.xml.MarcXmlHandler +handle-mabxml org.culturegraph.mf.stream.converter.xml.MabXmlHandler # Encoders: encode-literals org.culturegraph.mf.stream.converter.StreamLiteralFormater diff --git a/src/test/java/org/culturegraph/mf/stream/converter/xml/MabXmlHandlerTest.java b/src/test/java/org/culturegraph/mf/stream/converter/xml/MabXmlHandlerTest.java new file mode 100644 index 000000000..845a9f261 --- /dev/null +++ b/src/test/java/org/culturegraph/mf/stream/converter/xml/MabXmlHandlerTest.java @@ -0,0 +1,24 @@ +/* Copyright 2013 hbz, Pascal Christoph. + * Licensed under the Eclipse Public License 1.0 */ +package org.culturegraph.mf.stream.converter.xml; + +import java.io.File; +import java.io.IOException; +import java.net.URISyntaxException; + +import org.antlr.runtime.RecognitionException; +import org.culturegraph.mf.Flux; +import org.junit.Test; + +/** + * @author Pascal Christoph (dr0i) + * + */ +public final class MabXmlHandlerTest { + + @Test + public void testFlux() throws URISyntaxException, IOException, RecognitionException { + final File fluxFile = new File("examples/read/mab2/mabXml.flux"); + Flux.main(new String[] { fluxFile.getAbsolutePath() }); + } +} From d97d15db3fe358888b98d6217d8e076717969a6a Mon Sep 17 00:00:00 2001 From: pascal Christoph Date: Mon, 26 May 2014 16:39:31 +0200 Subject: [PATCH 2/3] Add xml entity splitter, add an example as flow test --- examples/read/xmlSplitter/gndRdf.xml.bz2 | Bin 0 -> 882 bytes .../read/xmlSplitter/xmlEntitySplitting.flux | 9 + .../converter/xml/XmlEntitySplitter.java | 157 ++++++++++++++++++ src/main/resources/flux-commands.properties | 2 + .../converter/xml/XmlEntitySplitterTest.java | 24 +++ 5 files changed, 192 insertions(+) create mode 100644 examples/read/xmlSplitter/gndRdf.xml.bz2 create mode 100644 examples/read/xmlSplitter/xmlEntitySplitting.flux create mode 100644 src/main/java/org/culturegraph/mf/stream/converter/xml/XmlEntitySplitter.java create mode 100644 src/test/java/org/culturegraph/mf/stream/converter/xml/XmlEntitySplitterTest.java diff --git a/examples/read/xmlSplitter/gndRdf.xml.bz2 b/examples/read/xmlSplitter/gndRdf.xml.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..ea1ba23d0bcec5e279f676f7083c92da2a6cb21b GIT binary patch literal 882 zcmV-&1C9JbT4*^jL0KkKSxTGT0RRCP-+%xRS!e(M@2y_IKmXt0Py?gegLD8ILPbF| zo`gM5MD;yP)C_3=0D7BIpn8KOlNy+S&}0ArG-3b%0#ituPgL|rqB;KY>MLdm9Q`7+ThJa#e>Uxx+utOTz5)FOOuPbw2 zq7vBAaOQ?9+L#{(xqW#u^!Qa*0NZ1&=;oeS(nJ(dj>`r{RRYXJeQCuyoZP0h)>A1l zi6ZWJ3rsY)o=f>c29%Qq`nV)Mx66Q>$GKPyvaDg=txgF5963NT!@TCJX%|VwJ*^qUptiBsWbCzQ6I}HIZ5cyK1X1r7y zX=g5tN<53W>|XhG2LC2WDe%VAaf-cXrr-Ndk}F7(&S*A<>%Jm6f;k!xifDFtyXHIy%O zGoAFYcgWGRyGbKeCeGh;pt_#OclD&jCFi1s4(X&{vM$bzab_m=@|V8XGtcxoGM{(yNe0IcE)NpPy z>Mtzl+D$!JFq`Ug8uaX)D`|b@X0SCf?D$ zzBFY@+g9ZfOXVGL7aZ;$jr@J4=w~U!-7Li%+O&>9#u6ML7(Ar}F=1M;Fr)lk$rRy2 IK`L)_1n@wg6951J literal 0 HcmV?d00001 diff --git a/examples/read/xmlSplitter/xmlEntitySplitting.flux b/examples/read/xmlSplitter/xmlEntitySplitting.flux new file mode 100644 index 000000000..03fbdf6ed --- /dev/null +++ b/examples/read/xmlSplitter/xmlEntitySplitting.flux @@ -0,0 +1,9 @@ +default files = FLUX_DIR; + +files + "gndRdf.xml.bz2" | +open-file | +decode-xml| +split-xml(entityName="Description",toplevelelement="rdf:RDF")| +extract-literals| +write("stdout") +}; diff --git a/src/main/java/org/culturegraph/mf/stream/converter/xml/XmlEntitySplitter.java b/src/main/java/org/culturegraph/mf/stream/converter/xml/XmlEntitySplitter.java new file mode 100644 index 000000000..62892ca18 --- /dev/null +++ b/src/main/java/org/culturegraph/mf/stream/converter/xml/XmlEntitySplitter.java @@ -0,0 +1,157 @@ +/** Copyright 2013,214 hbz, Pascal Christoph. + * Licensed under the Eclipse Public License 1.0 + **/ +package org.culturegraph.mf.stream.converter.xml; + +import java.util.HashSet; + +import org.apache.commons.lang.StringEscapeUtils; +import org.culturegraph.mf.framework.DefaultXmlPipe; +import org.culturegraph.mf.framework.StreamReceiver; +import org.culturegraph.mf.framework.XmlReceiver; +import org.culturegraph.mf.framework.annotations.Description; +import org.culturegraph.mf.framework.annotations.In; +import org.culturegraph.mf.framework.annotations.Out; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; + +/** + * An XML entity splitter. + * + * @author Pascal Christoph (dr0i) + * + */ +@Description("Splits all entities (aka records) residing in one XML document into multiple single XML documents.") +@In(XmlReceiver.class) +@Out(StreamReceiver.class) +public final class XmlEntitySplitter extends DefaultXmlPipe { + + /** + * Returns the XML declaration which is hard coded. @TODO change that hard + * wired. + * + * @return the XML decalration + */ + public static String getXmlDeclaration() { + return XmlEntitySplitter.XML_DECLARATION; + } + + private String entity; + private StringBuilder builder = new StringBuilder(); + private final HashSet namespaces = new HashSet(); + private boolean inEntity = false; + private int recordCnt = 0; + private String root; + private final static String XML_DECLARATION = ""; + + private int entityDepth = 0; + + private void appendValuesToEntity(final String qName, final Attributes attributes) { + this.builder.append("<" + qName); + if (attributes.getLength() > 0) { + for (int i = 0; i < attributes.getLength(); i++) { + this.builder.append(" " + attributes.getQName(i) + "=\"" + + StringEscapeUtils.escapeXml(attributes.getValue(i)) + "\""); + } + } + + this.builder.append(">"); + } + + @Override + public void characters(final char[] chars, final int start, final int length) + throws SAXException { + try { + this.builder.append(StringEscapeUtils.escapeXml(new String(chars, start, length))); + } catch (final Exception e) { + reset(); + } + } + + @Override + public void endElement(final String uri, final String localName, final String qName) + throws SAXException { + if (this.inEntity) { + this.builder.append(""); + if (this.entity.equals(localName)) { + if (this.entityDepth <= 1) { + final StringBuilder sb = new StringBuilder(XmlEntitySplitter.XML_DECLARATION + + "<" + this.root); + if (this.namespaces != null) { + for (final String ns : this.namespaces) { + sb.append(ns); + } + sb.append(">"); + } + this.builder.insert(0, sb.toString()).append(""); + getReceiver().literal("entity", this.builder.toString()); + getReceiver().endRecord(); + reset(); + return; + } + this.entityDepth--; + } + } + } + + @Override + public void onResetStream() { + reset(); + } + + private void reset() { + this.inEntity = false; + this.builder = new StringBuilder(); + this.entityDepth = 0; + } + + /** + * Sets the name of the entity. All these entities in the XML stream will be + * XML documents on their own. + * + * @param name + * Identifies the entities + */ + public void setEntityName(final String name) { + this.entity = name; + } + + /** + * Sets the top-level XML document element. + * + * @param name + * the element + */ + public void setTopLevelElement(final String name) { + this.root = name; + } + + @Override + public void startElement(final String uri, final String localName, final String qName, + final Attributes attributes) throws SAXException { + if (!this.inEntity) { + if (this.entity.equals(localName)) { + this.builder = new StringBuilder(); + getReceiver().startRecord(String.valueOf(this.recordCnt++)); + this.inEntity = true; + appendValuesToEntity(qName, attributes); + this.entityDepth++; + } else if (this.root == null) { + this.root = qName; + } + } else { + if (this.entity.equals(localName)) { + this.entityDepth++; + } + appendValuesToEntity(qName, attributes); + } + } + + @Override + public void startPrefixMapping(final String prefix, final String uri) throws SAXException { + super.startPrefixMapping(prefix, uri); + if (!prefix.isEmpty() && uri != null) { + this.namespaces.add(" xmlns:" + prefix + "=\"" + uri + "\""); + } + } +} diff --git a/src/main/resources/flux-commands.properties b/src/main/resources/flux-commands.properties index 473d180d9..798a35688 100644 --- a/src/main/resources/flux-commands.properties +++ b/src/main/resources/flux-commands.properties @@ -42,6 +42,7 @@ handle-cg-xml org.culturegraph.mf.stream.converter.xml.CGXmlHandler handle-generic-xml org.culturegraph.mf.stream.converter.xml.GenericXmlHandler handle-marcxml org.culturegraph.mf.stream.converter.xml.MarcXmlHandler handle-mabxml org.culturegraph.mf.stream.converter.xml.MabXmlHandler +split-xml org.culturegraph.mf.stream.converter.xml.XmlEntitySplitter # Encoders: encode-literals org.culturegraph.mf.stream.converter.StreamLiteralFormater @@ -70,6 +71,7 @@ filter-duplicate-objects org.culturegraph.mf.stream.pipe.DuplicateObjectFilter object-tee org.culturegraph.mf.stream.pipe.ObjectTee stream-tee org.culturegraph.mf.stream.pipe.StreamTee +xml-tee org.culturegraph.mf.stream.pipe.XmlTee wait-for-inputs org.culturegraph.mf.stream.pipe.CloseSupressor stream-to-xml org.culturegraph.mf.stream.sink.SimpleXmlWriter diff --git a/src/test/java/org/culturegraph/mf/stream/converter/xml/XmlEntitySplitterTest.java b/src/test/java/org/culturegraph/mf/stream/converter/xml/XmlEntitySplitterTest.java new file mode 100644 index 000000000..3cd422c9f --- /dev/null +++ b/src/test/java/org/culturegraph/mf/stream/converter/xml/XmlEntitySplitterTest.java @@ -0,0 +1,24 @@ +/* Copyright 2013 hbz, Pascal Christoph. + * Licensed under the Eclipse Public License 1.0 */ +package org.culturegraph.mf.stream.converter.xml; + +import java.io.File; +import java.io.IOException; +import java.net.URISyntaxException; + +import org.antlr.runtime.RecognitionException; +import org.culturegraph.mf.Flux; +import org.junit.Test; + +/** + * @author Pascal Christoph (dr0i) + * + */ +public final class XmlEntitySplitterTest { + + @Test + public void testFlux() throws URISyntaxException, IOException, RecognitionException { + final File fluxFile = new File("examples/read/xmlSplitter/xmlEntitySplitting.flux"); + Flux.main(new String[] { fluxFile.getAbsolutePath() }); + } +} From 32c9666a941c11a239377fa40a50fbdee4b9475b Mon Sep 17 00:00:00 2001 From: pascal Christoph Date: Mon, 26 May 2014 16:56:19 +0200 Subject: [PATCH 3/3] Add OAI-PMH opener and pica xml handler * add oclc library to handle OAI-PMH * add XPath library needed by oclc OAI-PMH library * add some tests --- examples/oaiPmh/openOaiPmh-zdbIsil.flux | 9 ++ examples/read/pica/pica.xml.bz2 | Bin 0 -> 1676 bytes examples/read/pica/picaXml.flux | 9 ++ pom.xml | 15 +- .../stream/converter/xml/PicaXmlHandler.java | 74 ++++++++++ .../mf/stream/source/OaiPmhOpener.java | 128 ++++++++++++++++++ src/main/resources/flux-commands.properties | 2 + .../converter/xml/PicaXmlHandlerTest.java | 24 ++++ .../mf/stream/source/OaiPmhOpenerTest.java | 24 ++++ 9 files changed, 282 insertions(+), 3 deletions(-) create mode 100644 examples/oaiPmh/openOaiPmh-zdbIsil.flux create mode 100644 examples/read/pica/pica.xml.bz2 create mode 100644 examples/read/pica/picaXml.flux create mode 100644 src/main/java/org/culturegraph/mf/stream/converter/xml/PicaXmlHandler.java create mode 100644 src/main/java/org/culturegraph/mf/stream/source/OaiPmhOpener.java create mode 100644 src/test/java/org/culturegraph/mf/stream/converter/xml/PicaXmlHandlerTest.java create mode 100644 src/test/java/org/culturegraph/mf/stream/source/OaiPmhOpenerTest.java diff --git a/examples/oaiPmh/openOaiPmh-zdbIsil.flux b/examples/oaiPmh/openOaiPmh-zdbIsil.flux new file mode 100644 index 000000000..5bcf948a1 --- /dev/null +++ b/examples/oaiPmh/openOaiPmh-zdbIsil.flux @@ -0,0 +1,9 @@ +default files = FLUX_DIR; + +// beware: to use this URL your IP has to be allowed by registration +"http://services.d-nb.de/oai/repository" | +open-oaipmh(dateFrom="2013-08-11",dateUntil="2013-08-12",metadataPrefix="PicaPlus-xml",setSpec="bib") | +decode-xml | +handle-picaxml | +encode-formeta(style="multiline")| +write("stdout"); \ No newline at end of file diff --git a/examples/read/pica/pica.xml.bz2 b/examples/read/pica/pica.xml.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..9f5cb1fc60618482cb60054a91360acd88d5382f GIT binary patch literal 1676 zcmV;726OpBT4*^jL0KkKS^v)7%>V{t-+*v1d1wFs|L_0WKmXs*UlE18Euw5fsHeGKAC7G&J@nUFLi&>9hhWH6Y900A_@Ndjr0Q#1gH=|`F-glU?dN$MV>$~{M^qd}#^II=h=e(CWY< z3%{DEWKqK)scNP(319k;^A1KAG6q|0BJ#9xbuPnQr}?SIDy_VrH=-Y;a2T7tl*f^U zqc~IfKD_JO-Z_fieV1YAe%;b*n|vIb&M`>x6m~giByUGq0~UoUR}NIpz6*Kb4lp#I zMtd(I!YPty?C_0OikRmA6lhor2MXEGtLj|K-*T9UB{~+Sr-i5)Ols)18W6UKlWUib z&{dp)DjTdUmKyg(uF=kibb-TiAmAWM3#^w#M2JJyqrK(>vD59AS+DO?& zb(E6Y%pyVX!kLX{AKbQ9D9H^dr;Idc`M^hUI=Eo+<$gJ*JIXNjEx@($MgizH)T0`_ z&Ky=O?(mQpi4&VcB9RP|fc3rO*O5`rzbh8$B+5uaCeHHBL@~@d2!nz|Lds@ZW*LO5L7sO(vQkG zuAR=@P4tgdyYsSK_U4~fcLDLJ0Rlepym1%xm)K(n2CiJ7?06Wg;e<3O!ArQHEl2q_ z|KD6`y6oeo?)8ye%YYf0^p4!lukCW4i2cw$40>lC3M4+L8=^x?1{aPow0AAX%Q>zd zCj0jFvyB)T=gZYr*!pyENe4i}=@61+2)`5RqtgsaYVC-)K%JIMpCnzkp7Hn-ndNGpBoa5jjRYJO^lM z42xV!!p?GaBfL0aBej?MM;gg1Fz_ei{Bqqmtztj}OOsaE5Q zHOBs1&45V$zR>Q(-Kwzl;j`oB2uV#t9mzuktT&)U73dL+SX^Go(FGWDl#A_D4W)7; z0{|^Z1VJ#YJP_8CIG-1i+D)H@1Y-34oOSKB@Epf+XcqMTiSEzh;w271#w3FoBC9<~ z#|lF0!pGYUTJU!3tiJa|A$d1e;emm?{$>)CN<^4Aj1m%BQgONB5?E+BY8ZaHd$znZ zCv9X1xkB(}dIVz0$kskEWLUScUi~nutdwuHZrp0_p0y!%* zggA#%tt>=K47K!jQp%F9ocs=wYReO0G;w}x1-(*JI%6g5LAhv2p2o5a2?8d{2KkI< zW2CRsV$C{>*aliaeb%LzH?gh`GAPLE!?B!xpJ;@HSBnzuW;Tj5F)+b{8y;#!_#$AQ zra-8u;B?u`%&n#OB?{}Dw6N!yrS6qMy#hJc*7~+V48rX%T0&iG_zzE3jFP$rAClE6c%9G0{mboQpMN}NsUsW8I6)D2jJ#fgQ zhmKM?0SPWkusZQZ;JC?^*^6qldC=x>K>5t3YnM_u`OTJNt - - - + + org.dspace + oclc-harvester2 + 0.1.12 + + + xalan + xalan + 2.7.1 + + + diff --git a/src/main/java/org/culturegraph/mf/stream/converter/xml/PicaXmlHandler.java b/src/main/java/org/culturegraph/mf/stream/converter/xml/PicaXmlHandler.java new file mode 100644 index 000000000..3731f03f4 --- /dev/null +++ b/src/main/java/org/culturegraph/mf/stream/converter/xml/PicaXmlHandler.java @@ -0,0 +1,74 @@ +/** Copyright 2013 hbz, Pascal Christoph. + * Licensed under the Eclipse Public License 1.0 + **/ + +package org.culturegraph.mf.stream.converter.xml; + +import java.text.Normalizer; + +import org.culturegraph.mf.framework.DefaultXmlPipe; +import org.culturegraph.mf.framework.StreamReceiver; +import org.culturegraph.mf.framework.XmlReceiver; +import org.culturegraph.mf.framework.annotations.Description; +import org.culturegraph.mf.framework.annotations.In; +import org.culturegraph.mf.framework.annotations.Out; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; + +/** + * A pica xml reader. + * + * @author Pascal Christoph (dr0i) + * + */ +@Description("A pica xml reader") +@In(XmlReceiver.class) +@Out(StreamReceiver.class) +public final class PicaXmlHandler extends DefaultXmlPipe { + + private static final String SUBFIELD = "subf"; + private static final String DATAFIELD = "tag"; + private static final String RECORD = "record"; + private static final String NAMESPACE = + "http://www.oclcpica.org/xmlns/ppxml-1.0"; + private static final String LEADER = "global"; + private String currentTag = ""; + private StringBuilder builder = new StringBuilder(); + + @Override + public void startElement(final String uri, final String localName, + final String qName, final Attributes attributes) throws SAXException { + if (SUBFIELD.equals(localName)) { + builder = new StringBuilder(); + currentTag = attributes.getValue("id"); + } else if (DATAFIELD.equals(localName)) { + getReceiver().startEntity( + attributes.getValue("id") + attributes.getValue("occ")); + } else if (RECORD.equals(localName) && NAMESPACE.equals(uri)) { + getReceiver().startRecord(""); + } else if (LEADER.equals(localName)) { + builder = new StringBuilder(); + currentTag = LEADER; + } + } + + @Override + public void endElement(final String uri, final String localName, + final String qName) throws SAXException { + if (SUBFIELD.equals(localName)) { + getReceiver().literal(currentTag, + Normalizer.normalize(builder.toString().trim(), Normalizer.Form.NFC)); + } else if (DATAFIELD.equals(localName)) { + getReceiver().endEntity(); + } else if (RECORD.equals(localName) && NAMESPACE.equals(uri)) { + getReceiver().endRecord(); + } + } + + @Override + public void characters(final char[] chars, final int start, final int length) + throws SAXException { + builder.append(chars, start, length); + } + +} diff --git a/src/main/java/org/culturegraph/mf/stream/source/OaiPmhOpener.java b/src/main/java/org/culturegraph/mf/stream/source/OaiPmhOpener.java new file mode 100644 index 000000000..66f19a23b --- /dev/null +++ b/src/main/java/org/culturegraph/mf/stream/source/OaiPmhOpener.java @@ -0,0 +1,128 @@ +/* Copyright 2013 Pascal Christoph. + * Licensed under the Eclipse Public License 1.0 */ + +package org.culturegraph.mf.stream.source; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Reader; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.transform.TransformerException; + +import org.culturegraph.mf.exceptions.MetafactureException; +import org.culturegraph.mf.framework.DefaultObjectPipe; +import org.culturegraph.mf.framework.ObjectReceiver; +import org.culturegraph.mf.framework.annotations.Description; +import org.culturegraph.mf.framework.annotations.In; +import org.culturegraph.mf.framework.annotations.Out; +import org.culturegraph.mf.stream.source.Opener; +import org.xml.sax.SAXException; + +import ORG.oclc.oai.harvester2.app.RawWrite; + +/** + * Opens an OAI-PMH stream and passes a reader to the receiver. + * + * @author Pascal Christoph (dr0i) + * + */ +@Description("Opens an OAI-PMH stream and passes a reader to the receiver. Mandatory arguments are: BASE_URL, DATE_FROM, DATE_UNTIL, METADATA_PREFIX, SET_SPEC .") +@In(String.class) +@Out(java.io.Reader.class) +public final class OaiPmhOpener extends + DefaultObjectPipe> implements Opener { + + private String encoding = "UTF-8"; + + final ByteArrayOutputStream OUTPUT_STREAM = new ByteArrayOutputStream(); + + private String dateFrom; + + private String dateUntil; + + private String setSpec; + + private String metadataPrefix; + + /** + * Default constructor + */ + public OaiPmhOpener() { + + } + + /** + * Sets the encoding to use. The default setting is UTF-8. + * + * @param encoding new default encoding + */ + public void setEncoding(final String encoding) { + this.encoding = encoding; + } + + /** + * Sets the beginning of the retrieving of updated data. The form is + * YYYY-MM-DD . + * + * @param dateFrom The form is YYYY-MM-DD . + */ + public void setDateFrom(final String dateFrom) { + this.dateFrom = dateFrom; + } + + /** + * Sets the end of the retrieving of updated data. The form is YYYY-MM-DD . + * + * @param dateUntil The form is YYYY-MM-DD . + */ + public void setDateUntil(final String dateUntil) { + this.dateUntil = dateUntil; + } + + /** + * Sets the OAI-PM metadata prefix . + * + * @param metadataPrefix the OAI-PM metadata prefix + */ + public void setMetadataPrefix(final String metadataPrefix) { + this.metadataPrefix = metadataPrefix; + } + + /** + * Sets the OAI-PM set specification . + * + * @param setSpec th OAI-PM set specification + */ + public void setSetSpec(final String setSpec) { + this.setSpec = setSpec; + } + + @Override + public void process(final String baseUrl) { + + try { + RawWrite.run(baseUrl, this.dateFrom, this.dateUntil, this.metadataPrefix, + this.setSpec, OUTPUT_STREAM); + } catch (IOException e) { + e.printStackTrace(); + } catch (ParserConfigurationException e) { + e.printStackTrace(); + } catch (SAXException e) { + e.printStackTrace(); + } catch (TransformerException e) { + e.printStackTrace(); + } catch (NoSuchFieldException e) { + e.printStackTrace(); + } + try { + getReceiver().process( + new InputStreamReader(new ByteArrayInputStream(OUTPUT_STREAM + .toByteArray()), encoding)); + } catch (IOException e) { + throw new MetafactureException(e); + } + } +} diff --git a/src/main/resources/flux-commands.properties b/src/main/resources/flux-commands.properties index 798a35688..bb32a3938 100644 --- a/src/main/resources/flux-commands.properties +++ b/src/main/resources/flux-commands.properties @@ -4,6 +4,7 @@ open-gzip org.culturegraph.mf.stream.source.GzipOpener open-bzip2 org.culturegraph.mf.stream.source.Bzip2Opener open-http org.culturegraph.mf.stream.source.HttpOpener open-resource org.culturegraph.mf.stream.source.ResourceOpener +open-oaipmh org.culturegraph.mf.stream.source.OaiPmhOpener read-string org.culturegraph.mf.stream.source.StringReader read-dir org.culturegraph.mf.stream.source.DirReader read-triples org.culturegraph.mf.stream.source.TripleReader @@ -43,6 +44,7 @@ handle-generic-xml org.culturegraph.mf.stream.converter.xml.GenericXmlHandler handle-marcxml org.culturegraph.mf.stream.converter.xml.MarcXmlHandler handle-mabxml org.culturegraph.mf.stream.converter.xml.MabXmlHandler split-xml org.culturegraph.mf.stream.converter.xml.XmlEntitySplitter +handle-picaxml org.culturegraph.mf.stream.converter.xml.PicaXmlHandler # Encoders: encode-literals org.culturegraph.mf.stream.converter.StreamLiteralFormater diff --git a/src/test/java/org/culturegraph/mf/stream/converter/xml/PicaXmlHandlerTest.java b/src/test/java/org/culturegraph/mf/stream/converter/xml/PicaXmlHandlerTest.java new file mode 100644 index 000000000..723862f67 --- /dev/null +++ b/src/test/java/org/culturegraph/mf/stream/converter/xml/PicaXmlHandlerTest.java @@ -0,0 +1,24 @@ +/* Copyright 2013 hbz, Pascal Christoph. + * Licensed under the Eclipse Public License 1.0 */ +package org.culturegraph.mf.stream.converter.xml; + +import java.io.File; +import java.io.IOException; +import java.net.URISyntaxException; + +import org.antlr.runtime.RecognitionException; +import org.culturegraph.mf.Flux; +import org.junit.Test; + +/** + * @author Pascal Christoph (dr0i) + * + */ +public final class PicaXmlHandlerTest { + + @Test + public void testFlux() throws URISyntaxException, IOException, RecognitionException { + final File fluxFile = new File("examples/read/pica/picaXml.flux"); + Flux.main(new String[] { fluxFile.getAbsolutePath() }); + } +} diff --git a/src/test/java/org/culturegraph/mf/stream/source/OaiPmhOpenerTest.java b/src/test/java/org/culturegraph/mf/stream/source/OaiPmhOpenerTest.java new file mode 100644 index 000000000..e5666eb32 --- /dev/null +++ b/src/test/java/org/culturegraph/mf/stream/source/OaiPmhOpenerTest.java @@ -0,0 +1,24 @@ +/* Copyright 2013 hbz, Pascal Christoph. + * Licensed under the Eclipse Public License 1.0 */ +package org.culturegraph.mf.stream.source; + +import java.io.File; +import java.io.IOException; +import java.net.URISyntaxException; + +import org.antlr.runtime.RecognitionException; +import org.culturegraph.mf.Flux; +import org.junit.Test; + +/** + * @author Pascal Christoph (dr0i) + * + */ +public final class OaiPmhOpenerTest { + + @Test + public void testFlux() throws URISyntaxException, IOException, RecognitionException { + final File fluxFile = new File("examples/oaiPmh/openOaiPmh-zdbIsil.flux"); + Flux.main(new String[] { fluxFile.getAbsolutePath() }); + } +}