diff --git a/examples/oaiPmh/openOaiPmh-zdbIsil.flux b/examples/oaiPmh/openOaiPmh-zdbIsil.flux new file mode 100644 index 000000000..5bcf948a1 --- /dev/null +++ b/examples/oaiPmh/openOaiPmh-zdbIsil.flux @@ -0,0 +1,9 @@ +default files = FLUX_DIR; + +// beware: to use this URL your IP has to be allowed by registration +"http://services.d-nb.de/oai/repository" | +open-oaipmh(dateFrom="2013-08-11",dateUntil="2013-08-12",metadataPrefix="PicaPlus-xml",setSpec="bib") | +decode-xml | +handle-picaxml | +encode-formeta(style="multiline")| +write("stdout"); \ No newline at end of file diff --git a/examples/read/mab2/HT010726584.xml.bz2 b/examples/read/mab2/HT010726584.xml.bz2 new file mode 100644 index 000000000..fbb68ad3c Binary files /dev/null and b/examples/read/mab2/HT010726584.xml.bz2 differ diff --git a/examples/read/mab2/mabXml.flux b/examples/read/mab2/mabXml.flux new file mode 100644 index 000000000..e37fc15d5 --- /dev/null +++ b/examples/read/mab2/mabXml.flux @@ -0,0 +1,9 @@ +default files = FLUX_DIR; + +files+"HT010726584.xml.bz2"| +open-file(compression="BZIP2") | +decode-xml | +handle-mabxml | +encode-formeta(style="multiline")| +write("stdout"); +}; diff --git a/examples/read/pica/pica.xml.bz2 b/examples/read/pica/pica.xml.bz2 new file mode 100644 index 000000000..9f5cb1fc6 Binary files /dev/null and b/examples/read/pica/pica.xml.bz2 differ diff --git a/examples/read/pica/picaXml.flux b/examples/read/pica/picaXml.flux new file mode 100644 index 000000000..abdc02dba --- /dev/null +++ b/examples/read/pica/picaXml.flux @@ -0,0 +1,9 @@ +default files = FLUX_DIR; + +files+"pica.xml.bz2"| +open-file(compression="BZIP2") | +decode-xml | +handle-picaxml | +encode-formeta(style="multiline")| +write("stdout"); +}; \ No newline at end of file diff --git a/examples/read/xmlSplitter/gndRdf.xml.bz2 b/examples/read/xmlSplitter/gndRdf.xml.bz2 new file mode 100644 index 000000000..ea1ba23d0 Binary files /dev/null and b/examples/read/xmlSplitter/gndRdf.xml.bz2 differ diff --git 
a/examples/read/xmlSplitter/xmlEntitySplitting.flux b/examples/read/xmlSplitter/xmlEntitySplitting.flux
new file mode 100644
index 000000000..03fbdf6ed
--- /dev/null
+++ b/examples/read/xmlSplitter/xmlEntitySplitting.flux
@@ -0,0 +1,9 @@
+default files = FLUX_DIR;
+
+files + "gndRdf.xml.bz2" |
+open-file |
+decode-xml|
+split-xml(entityName="Description",toplevelelement="rdf:RDF")|
+extract-literals|
+write("stdout")
+};
diff --git a/pom.xml b/pom.xml
index 9fbc7b933..83d96ceef 100644
--- a/pom.xml
+++ b/pom.xml
@@ -288,10 +288,19 @@ - - - + + org.dspace + oclc-harvester2 + 0.1.12 + + + xalan + xalan + 2.7.1 + + +
diff --git a/src/main/java/org/culturegraph/mf/stream/converter/xml/MabXmlHandler.java b/src/main/java/org/culturegraph/mf/stream/converter/xml/MabXmlHandler.java
new file mode 100644
index 000000000..4165660d1
--- /dev/null
+++ b/src/main/java/org/culturegraph/mf/stream/converter/xml/MabXmlHandler.java
@@ -0,0 +1,101 @@
+/** Copyright 2013,2014 hbz, Pascal Christoph.
+ * Licensed under the Eclipse Public License 1.0
+ **/
+
+package org.culturegraph.mf.stream.converter.xml;
+
+import org.culturegraph.mf.framework.DefaultXmlPipe;
+import org.culturegraph.mf.framework.StreamReceiver;
+import org.culturegraph.mf.framework.XmlReceiver;
+import org.culturegraph.mf.framework.annotations.Description;
+import org.culturegraph.mf.framework.annotations.In;
+import org.culturegraph.mf.framework.annotations.Out;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+/**
+ * A MAB XML reader.
+ * <p>
+ * Turns SAX events into stream events: {@code ListRecords} opens and closes a
+ * record, {@code datafield} and {@code controlfield} become entities and
+ * {@code subfield} and {@code controlfield} text becomes literals.
+ *
+ * @author Pascal Christoph (dr0i)
+ *
+ */
+@Description("A MAB XML reader")
+@In(XmlReceiver.class)
+@Out(StreamReceiver.class)
+public final class MabXmlHandler extends DefaultXmlPipe<StreamReceiver> {
+
+    private static final String SUBFIELD = "subfield";
+    private static final String DATAFIELD = "datafield";
+    private static final String CONTROLLFIELD = "controlfield";
+    private static final String RECORD = "ListRecords";
+    private static final String LEADER = "leader";
+    private static final String DATAFIELD_ATTRIBUTE = "tag";
+    private static final String SUBFIELD_ATTRIBUTE = "code";
+    private static final String INDICATOR1 = "ind1";
+    private static final String INDICATOR2 = "ind2";
+
+    /** Name of the literal emitted when the current element ends. */
+    private String currentTag = "";
+    /** Text collected since the last controlfield, subfield or leader start. */
+    private StringBuilder builder = new StringBuilder();
+
+    /**
+     * Returns {@code value}, or the empty string when the attribute is absent,
+     * so that entity names never contain the text "null".
+     */
+    private static String nullToEmpty(final String value) {
+        return value == null ? "" : value;
+    }
+
+    @Override
+    public void characters(final char[] chars, final int start, final int length)
+            throws SAXException {
+        builder.append(chars, start, length);
+    }
+
+    @Override
+    public void endElement(final String uri, final String localName, final String qName)
+            throws SAXException {
+        if (CONTROLLFIELD.equals(localName)) {
+            // A controlfield is an entity holding one literal; currentTag is "" here.
+            getReceiver().literal(currentTag, builder.toString().trim());
+            getReceiver().endEntity();
+        } else if (SUBFIELD.equals(localName)) {
+            getReceiver().literal(currentTag, builder.toString().trim());
+        } else if (DATAFIELD.equals(localName)) {
+            getReceiver().endEntity();
+        } else if (RECORD.equals(localName)) {
+            getReceiver().endRecord();
+        }
+        // NOTE(review): leader text is collected in startElement but never emitted
+        // here - confirm whether that is intentional.
+    }
+
+    @Override
+    public void startElement(final String uri, final String localName, final String qName,
+            final Attributes attributes) throws SAXException {
+        if (CONTROLLFIELD.equals(localName)) {
+            builder = new StringBuilder();
+            currentTag = "";
+            getReceiver().startEntity(attributes.getValue(DATAFIELD_ATTRIBUTE));
+        } else if (SUBFIELD.equals(localName)) {
+            builder = new StringBuilder();
+            currentTag = attributes.getValue(SUBFIELD_ATTRIBUTE);
+        } else if (DATAFIELD.equals(localName)) {
+            // Entity name is tag + ind1 + ind2; absent attributes contribute nothing.
+            getReceiver().startEntity(nullToEmpty(attributes.getValue(DATAFIELD_ATTRIBUTE))
+                    + nullToEmpty(attributes.getValue(INDICATOR1))
+                    + nullToEmpty(attributes.getValue(INDICATOR2)));
+        } else if (RECORD.equals(localName)) {
+            getReceiver().startRecord("");
+        } else if (LEADER.equals(localName)) {
+            builder = new StringBuilder();
+            currentTag = LEADER;
+        }
+    }
+
+}
diff --git a/src/main/java/org/culturegraph/mf/stream/converter/xml/PicaXmlHandler.java b/src/main/java/org/culturegraph/mf/stream/converter/xml/PicaXmlHandler.java
new file mode 100644
index 000000000..3731f03f4
--- /dev/null
+++ b/src/main/java/org/culturegraph/mf/stream/converter/xml/PicaXmlHandler.java
@@ -0,0 +1,74 @@
+/** Copyright 2013 hbz, Pascal Christoph.
+ * Licensed under the Eclipse Public License 1.0
+ **/
+
+package org.culturegraph.mf.stream.converter.xml;
+
+import java.text.Normalizer;
+
+import org.culturegraph.mf.framework.DefaultXmlPipe;
+import org.culturegraph.mf.framework.StreamReceiver;
+import org.culturegraph.mf.framework.XmlReceiver;
+import org.culturegraph.mf.framework.annotations.Description;
+import org.culturegraph.mf.framework.annotations.In;
+import org.culturegraph.mf.framework.annotations.Out;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+/**
+ * A pica xml reader.
+ * Absent "id" or "occ" attributes add nothing to an entity name (never the text "null").
+ *
+ * @author Pascal Christoph (dr0i)
+ */
+@Description("A pica xml reader")
+@In(XmlReceiver.class)
+@Out(StreamReceiver.class)
+public final class PicaXmlHandler extends DefaultXmlPipe<StreamReceiver> {
+    private static final String SUBFIELD = "subf";
+    private static final String DATAFIELD = "tag";
+    private static final String RECORD = "record";
+    private static final String NAMESPACE = "http://www.oclcpica.org/xmlns/ppxml-1.0";
+    private static final String LEADER = "global";
+    private String currentTag = "";
+    private StringBuilder builder = new StringBuilder();
+
+    @Override
+    public void startElement(final String uri, final String localName, final String qName,
+            final Attributes attributes) throws SAXException {
+        if (SUBFIELD.equals(localName)) {
+            builder = new StringBuilder();
+            currentTag = attributes.getValue("id");
+        } else if (DATAFIELD.equals(localName)) {
+            // Entity name is id + occ; getValue returns null for an absent attribute.
+            getReceiver().startEntity(nullToEmpty(attributes.getValue("id"))
+                    + nullToEmpty(attributes.getValue("occ")));
+        } else if (RECORD.equals(localName) && NAMESPACE.equals(uri)) {
+            getReceiver().startRecord("");
+        } else if (LEADER.equals(localName)) {
+            builder = new StringBuilder();
+            currentTag = LEADER;
+        }
+    }
+
+    @Override
+    public void endElement(final String uri, final String localName, final String qName) throws SAXException {
+        if (SUBFIELD.equals(localName)) {
+            final String text = builder.toString().trim();
+            getReceiver().literal(currentTag, Normalizer.normalize(text, Normalizer.Form.NFC));
+        } else if (DATAFIELD.equals(localName)) {
+            getReceiver().endEntity();
+        } else if (RECORD.equals(localName) && NAMESPACE.equals(uri)) {
+            getReceiver().endRecord();
+        }
+    }
+
+    @Override
+    public void characters(final char[] chars, final int start, final int length) throws SAXException {
+        builder.append(chars, start, length);
+    }
+
+    private static String nullToEmpty(final String value) {
+        return value == null ? "" : value;
+    }
+}
diff --git a/src/main/java/org/culturegraph/mf/stream/converter/xml/XmlEntitySplitter.java b/src/main/java/org/culturegraph/mf/stream/converter/xml/XmlEntitySplitter.java
new file mode 100644
index
000000000..62892ca18
--- /dev/null
+++ b/src/main/java/org/culturegraph/mf/stream/converter/xml/XmlEntitySplitter.java
@@ -0,0 +1,168 @@
+/** Copyright 2013,2014 hbz, Pascal Christoph.
+ * Licensed under the Eclipse Public License 1.0
+ **/
+package org.culturegraph.mf.stream.converter.xml;
+
+import java.util.HashSet;
+
+import org.apache.commons.lang.StringEscapeUtils;
+import org.culturegraph.mf.framework.DefaultXmlPipe;
+import org.culturegraph.mf.framework.StreamReceiver;
+import org.culturegraph.mf.framework.XmlReceiver;
+import org.culturegraph.mf.framework.annotations.Description;
+import org.culturegraph.mf.framework.annotations.In;
+import org.culturegraph.mf.framework.annotations.Out;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+/**
+ * An XML entity splitter.
+ * <p>
+ * Every element whose local name equals the configured entity name is
+ * re-serialised, wrapped in the top-level element together with all namespace
+ * declarations seen so far, and emitted as one record holding a single literal
+ * named "entity".
+ *
+ * @author Pascal Christoph (dr0i)
+ *
+ */
+@Description("Splits all entities (aka records) residing in one XML document into multiple single XML documents.")
+@In(XmlReceiver.class)
+@Out(StreamReceiver.class)
+public final class XmlEntitySplitter extends DefaultXmlPipe<StreamReceiver> {
+
+    private static final String XML_DECLARATION = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
+
+    /** Ready-made {@code xmlns:prefix="uri"} attributes collected from prefix mappings. */
+    private final HashSet<String> namespaces = new HashSet<String>();
+    private StringBuilder builder = new StringBuilder();
+    private String entity;
+    private String root;
+    private boolean inEntity = false;
+    /** Nesting level of entity elements; the outermost one is level 1. */
+    private int entityDepth = 0;
+    private int recordCnt = 0;
+
+    /**
+     * Returns the XML declaration which is hard coded. @TODO change that hard
+     * wired.
+     *
+     * @return the XML declaration
+     */
+    public static String getXmlDeclaration() {
+        return XML_DECLARATION;
+    }
+
+    /**
+     * Sets the name of the entity. All these entities in the XML stream will be
+     * XML documents on their own.
+     *
+     * @param name
+     *            Identifies the entities
+     */
+    public void setEntityName(final String name) {
+        this.entity = name;
+    }
+
+    /**
+     * Sets the top-level XML document element. When it is not set, the first
+     * element seen outside an entity is used.
+     *
+     * @param name
+     *            the element
+     */
+    public void setTopLevelElement(final String name) {
+        this.root = name;
+    }
+
+    @Override
+    public void startPrefixMapping(final String prefix, final String uri) throws SAXException {
+        super.startPrefixMapping(prefix, uri);
+        if (!prefix.isEmpty() && uri != null) {
+            namespaces.add(" xmlns:" + prefix + "=\"" + StringEscapeUtils.escapeXml(uri) + "\"");
+        }
+    }
+
+    @Override
+    public void startElement(final String uri, final String localName, final String qName,
+            final Attributes attributes) throws SAXException {
+        if (inEntity) {
+            if (entity.equals(localName)) {
+                entityDepth++;
+            }
+            appendValuesToEntity(qName, attributes);
+        } else if (entity.equals(localName)) {
+            builder = new StringBuilder();
+            getReceiver().startRecord(String.valueOf(recordCnt++));
+            inEntity = true;
+            appendValuesToEntity(qName, attributes);
+            entityDepth++;
+        } else if (root == null) {
+            root = qName;
+        }
+    }
+
+    @Override
+    public void characters(final char[] chars, final int start, final int length)
+            throws SAXException {
+        if (!inEntity) {
+            // Text outside an entity is never emitted, so do not buffer it.
+            return;
+        }
+        try {
+            builder.append(StringEscapeUtils.escapeXml(new String(chars, start, length)));
+        } catch (final Exception e) {
+            // NOTE(review): this abandons the record in progress without ending
+            // it - confirm that dropping it silently is intended.
+            reset();
+        }
+    }
+
+    @Override
+    public void endElement(final String uri, final String localName, final String qName)
+            throws SAXException {
+        if (!inEntity) {
+            return;
+        }
+        builder.append("</" + qName + ">");
+        if (!entity.equals(localName)) {
+            return;
+        }
+        if (entityDepth > 1) {
+            // A nested element with the entity's name: the outer one is still open.
+            entityDepth--;
+            return;
+        }
+        final StringBuilder head = new StringBuilder(XML_DECLARATION + "<" + root);
+        for (final String ns : namespaces) {
+            head.append(ns);
+        }
+        head.append(">");
+        builder.insert(0, head.toString()).append("</" + root + ">");
+        getReceiver().literal("entity", builder.toString());
+        getReceiver().endRecord();
+        reset();
+    }
+
+    @Override
+    public void onResetStream() {
+        reset();
+    }
+
+    /** Forgets the entity in progress. */
+    private void reset() {
+        inEntity = false;
+        builder = new StringBuilder();
+        entityDepth = 0;
+    }
+
+    /** Appends the start tag of {@code qName} with all of its attributes, values escaped. */
+    private void appendValuesToEntity(final String qName, final Attributes attributes) {
+        builder.append("<" + qName);
+        for (int i = 0; i < attributes.getLength(); i++) {
+            builder.append(" " + attributes.getQName(i) + "=\""
+                    + StringEscapeUtils.escapeXml(attributes.getValue(i)) + "\"");
+        }
+        builder.append(">");
+    }
+}
diff --git a/src/main/java/org/culturegraph/mf/stream/source/OaiPmhOpener.java b/src/main/java/org/culturegraph/mf/stream/source/OaiPmhOpener.java
new file mode 100644
index 000000000..66f19a23b
--- /dev/null
+++ b/src/main/java/org/culturegraph/mf/stream/source/OaiPmhOpener.java
@@ -0,0 +1,128 @@
+/* Copyright 2013 Pascal Christoph.
+ * Licensed under the Eclipse Public License 1.0 */
+
+package org.culturegraph.mf.stream.source;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.transform.TransformerException;
+
+import org.culturegraph.mf.exceptions.MetafactureException;
+import org.culturegraph.mf.framework.DefaultObjectPipe;
+import org.culturegraph.mf.framework.ObjectReceiver;
+import org.culturegraph.mf.framework.annotations.Description;
+import org.culturegraph.mf.framework.annotations.In;
+import org.culturegraph.mf.framework.annotations.Out;
+import org.culturegraph.mf.stream.source.Opener;
+import org.xml.sax.SAXException;
+
+import ORG.oclc.oai.harvester2.app.RawWrite;
+
+/**
+ * Opens an OAI-PMH stream and passes a reader to the receiver.
+ * The whole response is buffered in memory before it is passed on.
+ *
+ * @author Pascal Christoph (dr0i)
+ *
+ */
+@Description("Opens an OAI-PMH stream and passes a reader to the receiver. Mandatory arguments are: BASE_URL, DATE_FROM, DATE_UNTIL, METADATA_PREFIX, SET_SPEC .")
+@In(String.class)
+@Out(java.io.Reader.class)
+public final class OaiPmhOpener extends
+        DefaultObjectPipe<String, ObjectReceiver<Reader>> implements Opener {
+
+    private String encoding = "UTF-8";
+
+    /** Buffers one harvest; emptied at the start of every {@link #process(String)} call. */
+    final ByteArrayOutputStream OUTPUT_STREAM = new ByteArrayOutputStream();
+
+    private String dateFrom;
+    private String dateUntil;
+    private String setSpec;
+    private String metadataPrefix;
+
+    /**
+     * Default constructor
+     */
+    public OaiPmhOpener() {
+    }
+
+    /**
+     * Sets the encoding to use. The default setting is UTF-8.
+     *
+     * @param encoding new default encoding
+     */
+    public void setEncoding(final String encoding) {
+        this.encoding = encoding;
+    }
+
+    /**
+     * Sets the beginning of the retrieving of updated data. The form is
+     * YYYY-MM-DD .
+     *
+     * @param dateFrom The form is YYYY-MM-DD .
+     */
+    public void setDateFrom(final String dateFrom) {
+        this.dateFrom = dateFrom;
+    }
+
+    /**
+     * Sets the end of the retrieving of updated data. The form is YYYY-MM-DD .
+     *
+     * @param dateUntil The form is YYYY-MM-DD .
+     */
+    public void setDateUntil(final String dateUntil) {
+        this.dateUntil = dateUntil;
+    }
+
+    /**
+     * Sets the OAI-PMH metadata prefix.
+     *
+     * @param metadataPrefix the OAI-PMH metadata prefix
+     */
+    public void setMetadataPrefix(final String metadataPrefix) {
+        this.metadataPrefix = metadataPrefix;
+    }
+
+    /**
+     * Sets the OAI-PMH set specification.
+     *
+     * @param setSpec the OAI-PMH set specification
+     */
+    public void setSetSpec(final String setSpec) {
+        this.setSpec = setSpec;
+    }
+
+    /**
+     * Harvests the repository at {@code baseUrl} into memory and hands the
+     * result to the receiver as a reader.
+     *
+     * @param baseUrl base URL passed to the harvester
+     * @throws MetafactureException if harvesting or decoding fails
+     */
+    @Override
+    public void process(final String baseUrl) {
+        // Start from an empty buffer so repeated calls do not concatenate harvests.
+        OUTPUT_STREAM.reset();
+        try {
+            RawWrite.run(baseUrl, dateFrom, dateUntil, metadataPrefix, setSpec, OUTPUT_STREAM);
+            getReceiver().process(new InputStreamReader(
+                    new ByteArrayInputStream(OUTPUT_STREAM.toByteArray()), encoding));
+        } catch (final IOException e) {
+            throw new MetafactureException(e);
+        } catch (final ParserConfigurationException e) {
+            throw new MetafactureException(e);
+        } catch (final SAXException e) {
+            throw new MetafactureException(e);
+        } catch (final TransformerException e) {
+            throw new MetafactureException(e);
+        } catch (final NoSuchFieldException e) {
+            throw new MetafactureException(e);
+        }
+    }
+}
diff --git a/src/main/resources/flux-commands.properties b/src/main/resources/flux-commands.properties
index c37aedba3..bb32a3938 100644
--- a/src/main/resources/flux-commands.properties
+++ b/src/main/resources/flux-commands.properties
@@ -4,6 +4,7 @@ open-gzip org.culturegraph.mf.stream.source.GzipOpener open-bzip2 org.culturegraph.mf.stream.source.Bzip2Opener open-http org.culturegraph.mf.stream.source.HttpOpener open-resource org.culturegraph.mf.stream.source.ResourceOpener +open-oaipmh org.culturegraph.mf.stream.source.OaiPmhOpener read-string org.culturegraph.mf.stream.source.StringReader read-dir
org.culturegraph.mf.stream.source.DirReader read-triples org.culturegraph.mf.stream.source.TripleReader @@ -41,6 +42,9 @@ read-beacon org.culturegraph.mf.stream.reader.BeaconReader handle-cg-xml org.culturegraph.mf.stream.converter.xml.CGXmlHandler handle-generic-xml org.culturegraph.mf.stream.converter.xml.GenericXmlHandler handle-marcxml org.culturegraph.mf.stream.converter.xml.MarcXmlHandler +handle-mabxml org.culturegraph.mf.stream.converter.xml.MabXmlHandler +split-xml org.culturegraph.mf.stream.converter.xml.XmlEntitySplitter +handle-picaxml org.culturegraph.mf.stream.converter.xml.PicaXmlHandler # Encoders: encode-literals org.culturegraph.mf.stream.converter.StreamLiteralFormater @@ -69,6 +73,7 @@ filter-duplicate-objects org.culturegraph.mf.stream.pipe.DuplicateObjectFilter object-tee org.culturegraph.mf.stream.pipe.ObjectTee stream-tee org.culturegraph.mf.stream.pipe.StreamTee +xml-tee org.culturegraph.mf.stream.pipe.XmlTee wait-for-inputs org.culturegraph.mf.stream.pipe.CloseSupressor stream-to-xml org.culturegraph.mf.stream.sink.SimpleXmlWriter diff --git a/src/test/java/org/culturegraph/mf/stream/converter/xml/MabXmlHandlerTest.java b/src/test/java/org/culturegraph/mf/stream/converter/xml/MabXmlHandlerTest.java new file mode 100644 index 000000000..845a9f261 --- /dev/null +++ b/src/test/java/org/culturegraph/mf/stream/converter/xml/MabXmlHandlerTest.java @@ -0,0 +1,24 @@ +/* Copyright 2013 hbz, Pascal Christoph. 
+ * Licensed under the Eclipse Public License 1.0 */
+package org.culturegraph.mf.stream.converter.xml;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URISyntaxException;
+
+import org.antlr.runtime.RecognitionException;
+import org.culturegraph.mf.Flux;
+import org.junit.Test;
+
+/**
+ * Runs the example flow {@code examples/read/mab2/mabXml.flux} through Flux.
+ *
+ * @author Pascal Christoph (dr0i)
+ */
+public final class MabXmlHandlerTest {
+    @Test
+    public void testFlux() throws URISyntaxException, IOException, RecognitionException {
+        final String fluxPath = new File("examples/read/mab2/mabXml.flux").getAbsolutePath();
+        Flux.main(new String[] { fluxPath });
+    }
+}
diff --git a/src/test/java/org/culturegraph/mf/stream/converter/xml/PicaXmlHandlerTest.java b/src/test/java/org/culturegraph/mf/stream/converter/xml/PicaXmlHandlerTest.java
new file mode 100644
index 000000000..723862f67
--- /dev/null
+++ b/src/test/java/org/culturegraph/mf/stream/converter/xml/PicaXmlHandlerTest.java
@@ -0,0 +1,24 @@
+/* Copyright 2013 hbz, Pascal Christoph.
+ * Licensed under the Eclipse Public License 1.0 */
+package org.culturegraph.mf.stream.converter.xml;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URISyntaxException;
+
+import org.antlr.runtime.RecognitionException;
+import org.culturegraph.mf.Flux;
+import org.junit.Test;
+
+/**
+ * Runs the example flow {@code examples/read/pica/picaXml.flux} through Flux.
+ *
+ * @author Pascal Christoph (dr0i)
+ */
+public final class PicaXmlHandlerTest {
+    @Test
+    public void testFlux() throws URISyntaxException, IOException, RecognitionException {
+        final String fluxPath = new File("examples/read/pica/picaXml.flux").getAbsolutePath();
+        Flux.main(new String[] { fluxPath });
+    }
+}
diff --git a/src/test/java/org/culturegraph/mf/stream/converter/xml/XmlEntitySplitterTest.java b/src/test/java/org/culturegraph/mf/stream/converter/xml/XmlEntitySplitterTest.java
new file mode 100644
index 000000000..3cd422c9f
--- /dev/null
+++ b/src/test/java/org/culturegraph/mf/stream/converter/xml/XmlEntitySplitterTest.java
@@ -0,0 +1,24 @@
+/* Copyright 2013 hbz, Pascal Christoph.
+ * Licensed under the Eclipse Public License 1.0 */
+package org.culturegraph.mf.stream.converter.xml;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URISyntaxException;
+
+import org.antlr.runtime.RecognitionException;
+import org.culturegraph.mf.Flux;
+import org.junit.Test;
+
+/**
+ * Runs the example flow {@code examples/read/xmlSplitter/xmlEntitySplitting.flux} through Flux.
+ *
+ * @author Pascal Christoph (dr0i)
+ */
+public final class XmlEntitySplitterTest {
+    @Test
+    public void testFlux() throws URISyntaxException, IOException, RecognitionException {
+        final String fluxPath = new File("examples/read/xmlSplitter/xmlEntitySplitting.flux").getAbsolutePath();
+        Flux.main(new String[] { fluxPath });
+    }
+}
diff --git a/src/test/java/org/culturegraph/mf/stream/source/OaiPmhOpenerTest.java b/src/test/java/org/culturegraph/mf/stream/source/OaiPmhOpenerTest.java
new file mode 100644
index 000000000..e5666eb32
--- /dev/null
+++ b/src/test/java/org/culturegraph/mf/stream/source/OaiPmhOpenerTest.java
@@ -0,0 +1,24 @@
+/* Copyright 2013 hbz, Pascal Christoph.
+ * Licensed under the Eclipse Public License 1.0 */
+package org.culturegraph.mf.stream.source;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URISyntaxException;
+
+import org.antlr.runtime.RecognitionException;
+import org.culturegraph.mf.Flux;
+import org.junit.Test;
+
+/**
+ * Runs the example flow {@code examples/oaiPmh/openOaiPmh-zdbIsil.flux} through Flux.
+ * NOTE(review): that flow names a remote URL whose use needs IP registration - confirm it suits every build.
+ * @author Pascal Christoph (dr0i)
+ */
+public final class OaiPmhOpenerTest {
+    @Test
+    public void testFlux() throws URISyntaxException, IOException, RecognitionException {
+        final String fluxPath = new File("examples/oaiPmh/openOaiPmh-zdbIsil.flux").getAbsolutePath();
+        Flux.main(new String[] { fluxPath });
+    }
+}