Skip to content

Commit

Permalink
Store dc(t):subject as a list
Browse files Browse the repository at this point in the history
See hbz#4.

Subjects without an ID will be given an explicit BNode ID. They can then be
referenced in the morph and thus be treated like every other resource.

- store ntriples as files if log level is set to debug
	The ntriples are an intermediate step for producing json. For easier
	debugging it's also nice to have this step temporarily in the filesystem.
- enable explicit given BNodes when encoding triples
  • Loading branch information
dr0i committed Nov 17, 2016
1 parent ffac9ea commit 2c9a965
Show file tree
Hide file tree
Showing 6 changed files with 222 additions and 16 deletions.
1 change: 1 addition & 0 deletions .settings/org.eclipse.core.resources.prefs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
eclipse.preferences.version=1
encoding//src/main/java=UTF-8
encoding//src/main/resources=UTF-8
encoding//src/test/java=UTF-8
encoding//src/test/resources=UTF-8
encoding/<project>=UTF-8
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
<dependency>
<groupId>com.github.hbz</groupId>
<artifactId>lobid-rdf-to-json</artifactId>
<version>059d852d97e7866137b4f3001651b65dcc55642e</version>
<version>be2152b644913b0744686fc8c77e7cf584876f10</version>
</dependency>
<dependency>
<groupId>org.culturegraph</groupId>
Expand Down
4 changes: 3 additions & 1 deletion src/main/java/org/lobid/resources/PipeEncodeTriples.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ public class PipeEncodeTriples extends AbstractGraphPipeEncoder {
// dummy subject to store data even if the subject is unknown at first
final static String DUMMY_SUBJECT = "dummy_subject";
final static String HTTP = "^[hH][tT][Tt][Pp].*";
final static String BNODE = "^_:.*";
final static String FTP = "^[Ff][Tt][Pp].*";

final static String URN = "urn";
Expand Down Expand Up @@ -115,7 +116,8 @@ public void literal(final String name, final String value) {
try {
final Property prop = model.createProperty(name);
if (!name.contains(PROPERTY_AS_LITERALS) && (value.matches(HTTP)
|| value.matches(FTP) || (value.startsWith(URN) && storeUrnAsUri)
|| value.matches(BNODE) || value.matches(FTP)
|| (value.startsWith(URN) && storeUrnAsUri)
|| value.startsWith("mailto"))) {
boolean uri = true;
// either add uri ...
Expand Down
147 changes: 147 additions & 0 deletions src/main/java/org/lobid/resources/RdfModelFileWriter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
/* Copyright 2013,2016 Pascal Christoph, hbz.
* Licensed under the Eclipse Public License 1.0 */

package org.lobid.resources;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.util.NoSuchElementException;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.jena.riot.Lang;
import org.apache.jena.riot.RDFDataMgr;
import org.apache.jena.riot.RDFLanguages;
import org.culturegraph.mf.exceptions.MetafactureException;
import org.culturegraph.mf.framework.DefaultObjectReceiver;
import org.culturegraph.mf.framework.annotations.Description;
import org.culturegraph.mf.framework.annotations.In;
import org.culturegraph.mf.framework.annotations.Out;
import org.culturegraph.mf.util.xml.FilenameExtractor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.hp.hpl.jena.rdf.model.Model;

/**
 * A sink that writes each received RDF {@link Model} into its own file. The
 * file name is built from the object value of a configurable property found in
 * the model (by default {@code http://purl.org/dc/terms/identifier}), and the
 * default serialization is N-Triples.
 *
 * @author Pascal Christoph (dr0i)
 */
@Description("Writes the object value of an RDF model into a file. Default serialization is 'NTRIPLES'. The filename is "
    + "constructed from the literal of an given property (recommended properties are identifier)."
    + " Variable are " + "- 'target' (determining the output directory)"
    + "- 'property' (the property in the RDF model. The object of this property"
    + " will be the main part of the file's name.) "
    + "- 'startIndex' ( a subfolder will be extracted out of the filename. This marks the index' beginning )"
    + "- 'stopIndex' ( a subfolder will be extracted out of the filename. This marks the index' end )"
    + "- 'serialization (e.g. one of 'NTRIPLES', 'TURTLE', 'RDFXML','RDFJSON'")
@In(Model.class)
@Out(Void.class)
public final class RdfModelFileWriter extends DefaultObjectReceiver<Model>
    implements FilenameExtractor {
  private static final Logger LOG =
      LoggerFactory.getLogger(RdfModelFileWriter.class);

  /** Holds target directory, property, suffix, encoding and index settings. */
  private final FilenameUtil filenameUtil = new FilenameUtil();
  /** RDF language used when serializing a model. */
  private Lang serialization;

  /**
   * Creates a writer that names files after the dcterms:identifier of the
   * model, uses the file suffix "nt" and serializes as N-Triples.
   */
  public RdfModelFileWriter() {
    setProperty("http://purl.org/dc/terms/identifier");
    setFileSuffix("nt");
    setSerialization("NTRIPLES");
  }

  /** @return the character encoding used when writing the files */
  @Override
  public String getEncoding() {
    return filenameUtil.encoding;
  }

  /** @param encoding the character encoding used when writing the files */
  @Override
  public void setEncoding(final String encoding) {
    filenameUtil.encoding = encoding;
  }

  /** @param target the base output directory */
  @Override
  public void setTarget(final String target) {
    filenameUtil.target = target;
  }

  /**
   * @param property the URI of the property whose object value becomes the
   *          main part of the file name
   */
  @Override
  public void setProperty(final String property) {
    filenameUtil.property = property;
  }

  /** @param fileSuffix the file extension, without the leading dot */
  @Override
  public void setFileSuffix(final String fileSuffix) {
    filenameUtil.fileSuffix = fileSuffix;
  }

  /**
   * @param startIndex begin index (inclusive) of the identifier substring
   *          used as the subdirectory name
   */
  @Override
  public void setStartIndex(final int startIndex) {
    filenameUtil.startIndex = startIndex;
  }

  /**
   * @param endIndex end index (exclusive) of the identifier substring used as
   *          the subdirectory name
   */
  @Override
  public void setEndIndex(final int endIndex) {
    filenameUtil.endIndex = endIndex;
  }

  /**
   * Sets the rdf serialization language.
   *
   * @param serialization the name of the language to serialize to, resolved
   *          via {@link RDFLanguages#nameToLang(String)}
   */
  public void setSerialization(final String serialization) {
    // NOTE(review): nameToLang presumably yields null for unknown names, which
    // would only surface later in process() - confirm against the Jena version
    // in use.
    this.serialization = RDFLanguages.nameToLang(serialization);
  }

  /**
   * Serializes the model and writes it to
   * {@code <target>/<subdirectory>/<identifier>.<fileSuffix>}. Models lacking
   * the configured property are skipped with a warning.
   *
   * @param model the RDF model to be written
   * @throws MetafactureException if the file cannot be written
   */
  @Override
  public void process(final Model model) {
    final String identifier;
    try {
      identifier = model
          .listObjectsOfProperty(model.createProperty(filenameUtil.property))
          .next().toString();
    } catch (NoSuchElementException e) {
      LOG.warn("No identifier => cannot derive a filename for {}", model);
      return;
    }
    LOG.debug("Going to store identifier={}", identifier);

    // The subdirectory is a slice of the identifier; identifiers shorter than
    // endIndex are used unsliced as the directory name.
    String directory = identifier;
    if (directory.length() >= filenameUtil.endIndex) {
      directory =
          directory.substring(filenameUtil.startIndex, filenameUtil.endIndex);
    }
    final String file = FilenameUtils.concat(filenameUtil.target,
        FilenameUtils.concat(directory + File.separator,
            identifier + "." + filenameUtil.fileSuffix));
    LOG.debug("Write to {}", file);
    filenameUtil.ensurePathExists(file);

    // try-with-resources closes (and thereby flushes) the writer, so no
    // explicit close() is needed.
    try (
        final Writer writer = new OutputStreamWriter(new FileOutputStream(file),
            filenameUtil.encoding)) {
      final StringWriter tripleWriter = new StringWriter();
      RDFDataMgr.write(tripleWriter, model, this.serialization);
      IOUtils.write(tripleWriter.toString(), writer);
    } catch (IOException e) {
      // The cause is preserved in the rethrown exception; do not additionally
      // dump the stack trace to stderr.
      throw new MetafactureException(e);
    }
  }

}
52 changes: 38 additions & 14 deletions src/main/resources/morph-hbz01-to-lobid.xml
Original file line number Diff line number Diff line change
Expand Up @@ -123,23 +123,37 @@
<!-- /altLabel -->
</macros>
<rules>
<entity name="http://purl.org/dc/terms/subject" flushWith="@rdfTypeSubject"
<!-- subject as bnodes and their two properties (label & type )-->
<combine name="@subjectLabel" value="${a}" reset="true" sameEntity="true">
<data source="@rdfTypeSubject">
<regexp match="^http"/>
</data>
<data source="9[01234][27]-[-12].[aefghpstz-]" name="a">
<regexp match="(.*)" format="${1}"/>
</data>
</combine>
<combine value="_:${a}" name="@subjectLabelBnode">
<data source="@subjectLabel" name="a"/>
</combine>
<data source="@subjectLabelBnode" name="~rdf:subject"/>
<combine name="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" value="${a}"
reset="true" sameEntity="true">
<data source="@rdfTypeSubject" name="http://www.w3.org/1999/02/22-rdf-syntax-ns#type">
<data source="@rdfTypeSubject" name="">
<regexp match="^(http.*)"/>
</data>
<combine name="http://www.w3.org/2000/01/rdf-schema#label" value="${a}" reset="true" sameEntity="true">
<data source="@rdfTypeSubject">
<regexp match="^http"/>
</data>
<data source="9[01234][27]-[-12].[aefghpstz-]" name="a">
<regexp match="(.*)" format="${1}"/>
</data>
</combine>
<data source="@rdfTypeSubject" name="a"/>
<data source="@subjectLabelBnode"/>
</combine>
<data source="@subjectLabel" name="http://www.w3.org/2000/01/rdf-schema#label"/>
<!-- /subject as bnodes and their two properties (label & type) -->
<combine name="@subjectId" value="${a}">
<data source="9[01234][27]-[-12].9" name="a">
<regexp match="\(DE-588\)(.*)" format="http://d-nb.info/gnd/${1}"/>
</data>
</combine>
<entity name="~rdf:list" reset="true">
<data name="http://purl.org/dc/terms/subject" source="@subjectId|@subjectLabelBnode"/>
</entity>
<data source="9[01234][27]-[-12].9" name="http://purl.org/dc/terms/subject">
<regexp match="\(DE-588\)(.*)" format="http://d-nb.info/gnd/${1}"/>
</data>
<!-- ####################### -->
<!-- ####### Get subject uri of each record -->
<!-- ####################### -->
Expand Down Expand Up @@ -343,8 +357,11 @@
<data source="050." name="@medium">
<regexp match="^........[abcdefgz]" format="http://rdvocab.info/termList/RDACarrierType/1010"/>
</data>
<combine name="http://www.w3.org/1999/02/22-rdf-syntax-ns#type" value="${a}" >
<data source="@typeOnly" name="a"/>
<data source="@id" />
</combine>
<data source="@rdftype" name="http://www.w3.org/1999/02/22-rdf-syntax-ns#type"/>
<data source="@typeOnly" name="http://www.w3.org/1999/02/22-rdf-syntax-ns#type"/>
<!-- 051: type -->
<data source="051." name="@article">
<regexp match="^a|^.t|^..t|^...t|^.....t" format="$[ns-bibo]Article"/>
Expand Down Expand Up @@ -1428,11 +1445,17 @@
<!-- ########### -->
<!-- # set subject uri of resource anew -->
<!-- ########### -->

<combine name="~rdf:subject" value="$[ns-lobid-resource]${subject}#!">
<data
source="@altLabelPerson|@altLabelCorporateBody|@subjectLinks|@creatorPersonId|@creatorCorporateBodyId|@contributorPersonId|@contributorCorporateBodyId|@subjectTopic|@subjectMain"/>
<data source="@id" name="subject"/>
</combine>





<!-- other subjects -->
<data source="71[01][-abcdfz][123].a" name="http://purl.org/lobid/lv#subjectLabel">
<split delimiter=" / "/>
Expand Down Expand Up @@ -1514,6 +1537,7 @@
<call-macro name="subjectOrder" field="942"/>
<call-macro name="subjectOrder" field="947"/>
<!-- subjects without ID -->

<call-macro name="dctSubject" field="902"/>
<call-macro name="dctSubject" field="907"/>
<call-macro name="dctSubject" field="912"/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,13 @@ public final class Hbz01MabXml2ElasticsearchLobidTest {
static final String DIRECTORY_TO_TEST_JSON_FILES = PATH_TO_TEST + "jsonld/";

static boolean testFailed = false;
static final String NTRIPLES_DEBUG_FILES = "src/test/resources/nt";

@BeforeClass
public static void setup() {
if (LOG.isDebugEnabled()) {
etlDebug();
}
node = nodeBuilder().local(true)
.settings(Settings.builder().put("index.number_of_replicas", "0")
.put("index.number_of_shards", "1").put("path.home", "tmp/")
Expand All @@ -92,6 +96,11 @@ public static void etl(final Client cl,
client = cl;
final FileOpener opener = new FileOpener();
final Triples2RdfModel triple2model = new Triples2RdfModel();
RdfModelFileWriter rdfModelFileWriter = new RdfModelFileWriter();
rdfModelFileWriter.setProperty("http://purl.org/lobid/lv#hbzID");
rdfModelFileWriter.setStartIndex(2);
rdfModelFileWriter.setEndIndex(7);
rdfModelFileWriter.setTarget("src/test/resources/nt");
triple2model.setInput(N_TRIPLE);
opener.setReceiver(new TarReader()).setReceiver(new XmlDecoder())
.setReceiver(new AlephMabXmlHandler())
Expand All @@ -104,6 +113,29 @@ public static void etl(final Client cl,
opener.closeStream();
}

/**
 * Debugging helper: runs the transformation pipeline on the test input and
 * dumps the resulting ntriples as files below {@link #NTRIPLES_DEBUG_FILES}.
 *
 * @return a short message naming the directory the files were written to
 */
public static String etlDebug() {
  // sink: one ntriples file per record, named after its hbzID
  final RdfModelFileWriter ntriplesSink = new RdfModelFileWriter();
  ntriplesSink.setTarget(NTRIPLES_DEBUG_FILES);
  ntriplesSink.setProperty("http://purl.org/lobid/lv#hbzID");
  ntriplesSink.setStartIndex(2);
  ntriplesSink.setEndIndex(7);

  final Triples2RdfModel modelBuilder = new Triples2RdfModel();
  modelBuilder.setInput(N_TRIPLE);

  // wire the flow: tar -> xml -> mab handler -> morph -> triples -> model
  // -> files
  final FileOpener source = new FileOpener();
  source.setReceiver(new TarReader())
      .setReceiver(new XmlDecoder())
      .setReceiver(new AlephMabXmlHandler())
      .setReceiver(
          new Metamorph("src/main/resources/morph-hbz01-to-lobid.xml"))
      .setReceiver(new PipeEncodeTriples())
      .setReceiver(modelBuilder)
      .setReceiver(ntriplesSink);

  final String inputPath = new File(TEST_FILENAME_ALEPHXMLCLOBS).getAbsolutePath();
  source.process(inputPath);
  source.closeStream();
  return "Created files, see " + NTRIPLES_DEBUG_FILES;
}

@SuppressWarnings("static-method")
@Test
public void testJson() {
Expand Down

0 comments on commit 2c9a965

Please sign in to comment.