Skip to content

Commit

Permalink
[#5] Refactoring to extract anything from ontology header
Browse files Browse the repository at this point in the history
  • Loading branch information
psiotwo committed Mar 14, 2023
1 parent f5521b6 commit 98b4b1b
Show file tree
Hide file tree
Showing 14 changed files with 107 additions and 69 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,6 @@ See the [HTML report](https://psiotwo.github.io/ontology-version-extractor/outpu

Currently configured for OBO ontologies:

- `gradle run --args="extract -o output.ttl` will generate `output.ttl` - a Turtle representation of OBO ontology headers and their current versions.
- `gradle run --args="transform -i input.ttl -o output.csv` will generate `output.csv` - a CSV report with the list of OBO ontologies and their current versions.
- `gradle run --args="transform -i input.ttl -o output.html` will generate `output.html` - an HTML report with the list of OBO ontologies and their current versions.
- `gradle run --args="extract -o output.ttl"` will generate `output.ttl` - a Turtle representation of OBO ontology headers and their current versions.
- `gradle run --args="transform -i input.ttl -o output.csv"` will generate `output.csv` - a CSV report with the list of OBO ontologies and their current versions.
- `gradle run --args="transform -i input.ttl -o output.html"` will generate `output.html` - an HTML report with the list of OBO ontologies and their current versions.
2 changes: 1 addition & 1 deletion gradle/wrapper/gradle-wrapper.properties
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-7.5.1-bin.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-8.0.2-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
4 changes: 1 addition & 3 deletions settings.gradle
Original file line number Diff line number Diff line change
@@ -1,3 +1 @@
rootProject.name = 'ontology-version-extractor'
enableFeaturePreview("VERSION_CATALOGS")

rootProject.name = 'ontology-version-extractor'
39 changes: 39 additions & 0 deletions src/main/java/cz/sio2/obo/Extractor.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package cz.sio2.obo;

import cz.sio2.obo.extractor.OntologyHeaderExtractor;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static cz.sio2.obo.Constants.NS_OBO_HTTP;

/**
 * Applies the regular expressions supplied by an {@link OntologyHeaderExtractor} to raw ontology
 * text and collects the captured values into an {@link OntologyHeader}.
 */
public class Extractor {

    /**
     * Any line-break sequence (LF, CR, CRLF and the remaining Unicode line terminators). The regex
     * dot does not match these characters by default, so all of them have to be flattened before
     * the whole-input patterns are applied - replacing LF alone leaves CRLF documents unmatchable.
     */
    private static final Pattern LINE_BREAKS = Pattern.compile("\\R");

    /**
     * Returns capture group 1 of {@code pattern} when it matches the whole input.
     *
     * @param pattern    pattern with at least one capture group
     * @param singleLine input without line terminators
     * @return the captured value, or {@code null} if the pattern does not match
     */
    private static String get(final Pattern pattern, final String singleLine) {
        final Matcher m = pattern.matcher(singleLine);
        return m.matches() ? m.group(1) : null;
    }

    /**
     * Expands the {@code &obo;} entity reference to the OBO namespace.
     *
     * @param input extracted IRI, may be {@code null}
     * @return the input with every {@code &obo;} replaced, or {@code null} for {@code null} input
     */
    private static String sanitize(final String input) {
        return input == null ? null : input.replace("&obo;", NS_OBO_HTTP);
    }

    /**
     * Extracts the ontology IRI, version IRI and version info from the given content.
     *
     * @param s         raw ontology text (possibly multi-line); {@code null} is treated as
     *                  unrecognised content
     * @param extractor supplier of the format-specific patterns
     * @return the extracted header (individual fields are {@code null} when their pattern does not
     *         match), or {@code null} if the content is {@code null} or is not in the format
     *         recognised by {@code extractor}
     */
    public OntologyHeader extract(final String s, final OntologyHeaderExtractor extractor) {
        if (s == null) {
            return null;
        }
        // The patterns are matched against the entire input, so it must be a single line.
        final String singleLine = LINE_BREAKS.matcher(s).replaceAll(" ");
        if (!extractor.getFormatMatcher().matcher(singleLine).matches()) {
            return null;
        }
        final OntologyHeader ontologyHeader = new OntologyHeader();
        ontologyHeader.setOwlOntologyIri(sanitize(get(extractor.getIriMatcher(), singleLine)));
        ontologyHeader.setOwlVersionIri(sanitize(get(extractor.getVersionIriMatcher(), singleLine)));
        ontologyHeader.setOwlVersionInfo(get(extractor.getVersionInfoMatcher(), singleLine));
        return ontologyHeader;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import java.util.*;

@Slf4j
public class OBOFoundryVersionExtractor {
public class OBOFoundryHeaderExtractor {

private static void writeRDF(final String file, final Map<String, OntologyHeader> map) throws IOException {
try (final OutputStream os = new FileOutputStream(file)) {
Expand All @@ -47,7 +47,7 @@ private static void writeHTML(final String file, final Map<String, OntologyHeade
private static List<String> getOntologyUrls(final String registry) {
final Model model = ModelFactory.createDefaultModel();
model.read(registry, Lang.TURTLE.toString());
final String queryString = new Scanner(Objects.requireNonNull(OBOFoundryVersionExtractor.class.getResourceAsStream("/get-ontology-purls.rq")), StandardCharsets.UTF_8).useDelimiter("\\A").next();
final String queryString = new Scanner(Objects.requireNonNull(OBOFoundryHeaderExtractor.class.getResourceAsStream("/get-ontology-purls.rq")), StandardCharsets.UTF_8).useDelimiter("\\A").next();
final QueryExecution qe = QueryExecutionFactory
.create(queryString, model);
final ResultSet rs = qe.execSelect();
Expand All @@ -58,7 +58,7 @@ private static List<String> getOntologyUrls(final String registry) {
return list;
}

private static Map<String, OntologyHeader> fetchVersions(final List<String> ontologyUrls, final int headerLength) throws MalformedURLException {
private static Map<String, OntologyHeader> fetchHeaders(final List<String> ontologyUrls, final int headerLength) throws MalformedURLException {
final VersionFetcher f = new VersionFetcher();
final Map<String, OntologyHeader> map = new HashMap<>();
for (final String url : ontologyUrls) {
Expand All @@ -71,35 +71,35 @@ private static Map<String, OntologyHeader> fetchVersions(final List<String> onto

/**
 * Reads the ontology URLs from the registry, fetches a header for each of them and writes the
 * collected headers with {@code writeRDF}.
 * (Diff view: each removed line is immediately followed by the line that replaces it.)
 *
 * @param registryUrl  location of the registry, read as Turtle by {@code getOntologyUrls}
 * @param outputFile   file the RDF output is written to
 * @param headerLength forwarded to the header fetch; presumably the number of bytes read from
 *                     each ontology - TODO confirm against VersionFetcher
 * @throws IOException declared by this method; NOTE(review): presumably raised when the output
 *                     cannot be written - confirm against writeRDF
 */
public void extract(final String registryUrl, final String outputFile, final int headerLength) throws IOException {
final List<String> ontologyUrls = getOntologyUrls(registryUrl);
final Map<String, OntologyHeader> ontologyVersions = fetchVersions(ontologyUrls, headerLength);
writeRDF(outputFile, ontologyVersions);
final Map<String, OntologyHeader> ontologyHeaders = fetchHeaders(ontologyUrls, headerLength);
writeRDF(outputFile, ontologyHeaders);
}

/**
 * Loads ontology headers from a Turtle file.
 * (Diff view: each removed line is immediately followed by the line that replaces it.)
 *
 * Every subject typed as {@code owl:Ontology} becomes one {@link OntologyHeader}; the version IRI
 * and version info are set only when the corresponding statement is present.
 *
 * @param inputFile location of the Turtle document to read
 * @return headers keyed by the URI of the ontology resource
 */
private Map<String, OntologyHeader> loadVersions(final String inputFile) {
private Map<String, OntologyHeader> loadHeaders(final String inputFile) {
final Map<String, OntologyHeader> map = new HashMap<>();
final Model model = ModelFactory.createDefaultModel();
model.read(inputFile, Lang.TURTLE.toString());
model.listSubjectsWithProperty(RDF.type, OWL.Ontology).forEach(ontology -> {
final OntologyHeader version = new OntologyHeader();
version.setOwlOntologyIri(ontology.getURI());
final OntologyHeader header = new OntologyHeader();
header.setOwlOntologyIri(ontology.getURI());
// owl:versionIRI is optional; its object is read as a resource.
final Statement versionIri = ontology.getProperty(OWL2.versionIRI);
if ( versionIri != null ) {
version.setOwlVersionIri(versionIri.getObject().asResource().getURI());
header.setOwlVersionIri(versionIri.getObject().asResource().getURI());
}
// owl:versionInfo is optional; its object is read as a string literal.
final Statement versionInfo = ontology.getProperty(OWL2.versionInfo);
if ( versionInfo != null ) {
version.setOwlVersionInfo(versionInfo.getString());
header.setOwlVersionInfo(versionInfo.getString());
}
// NOTE(review): an anonymous ontology node would presumably give a null key here - confirm.
map.put(ontology.getURI(), version);
map.put(ontology.getURI(), header);
});
return map;
}

/**
 * Loads the ontology headers stored in {@code inputFile} and hands them to {@code writeCSV}
 * together with {@code outputFile}.
 * (Diff view: the removed call is immediately followed by the call that replaces it.)
 *
 * @param inputFile  Turtle file holding the ontology headers
 * @param outputFile target passed to {@code writeCSV}
 * @throws IOException declared by this method; NOTE(review): presumably raised when the output
 *                     cannot be written - confirm against writeCSV
 */
public void transformToCsv(final String inputFile, final String outputFile) throws IOException {
writeCSV(outputFile, loadVersions(inputFile));
writeCSV(outputFile, loadHeaders(inputFile));
}

/**
 * Loads the ontology headers stored in {@code inputFile} and hands them to {@code writeHTML}
 * together with {@code outputFile}.
 * (Diff view: the removed call is immediately followed by the call that replaces it.)
 *
 * @param inputFile  Turtle file holding the ontology headers
 * @param outputFile target passed to {@code writeHTML}
 * @throws IOException declared by this method; NOTE(review): presumably raised when the output
 *                     cannot be written - confirm against writeHTML
 */
public void transformToHtml(final String inputFile, final String outputFile) throws IOException {
writeHTML(outputFile, loadVersions(inputFile));
writeHTML(outputFile, loadHeaders(inputFile));
}
}
35 changes: 18 additions & 17 deletions src/main/java/cz/sio2/obo/VersionFetcher.java
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
package cz.sio2.obo;

import cz.sio2.obo.extractor.FSExtractor;
import cz.sio2.obo.extractor.RDFXMLExtractor;
import cz.sio2.obo.extractor.Extractor;
import cz.sio2.obo.extractor.XMLExtractor;
import cz.sio2.obo.extractor.FSOntologyHeaderExtractor;
import cz.sio2.obo.extractor.RDFXMLOntologyHeaderExtractor;
import cz.sio2.obo.extractor.OntologyHeaderExtractor;
import cz.sio2.obo.extractor.XMLOntologyHeaderExtractor;
import lombok.extern.slf4j.Slf4j;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.classic.methods.HttpGet;
Expand All @@ -28,22 +28,22 @@
@Slf4j
public class VersionFetcher {

final static List<Extractor> extractors = new ArrayList<>();
final static List<OntologyHeaderExtractor> ONTOLOGY_HEADER_EXTRACTORS = new ArrayList<>();

static {
extractors.add(new RDFXMLExtractor());
extractors.add(new FSExtractor());
extractors.add(new XMLExtractor());
ONTOLOGY_HEADER_EXTRACTORS.add(new RDFXMLOntologyHeaderExtractor());
ONTOLOGY_HEADER_EXTRACTORS.add(new FSOntologyHeaderExtractor());
ONTOLOGY_HEADER_EXTRACTORS.add(new XMLOntologyHeaderExtractor());
}

/**
* Fetches relevant parts of an ontology file which contain version information. Currently, it takes
* Fetches relevant parts of an ontology file which contain header information. Currently, it takes
* - first maxBytes of the document
* - last maxBytes of the document
*
* @param url URL to fetch the document from
* @param maxBytes maximal number of bytes to fetch from each side of the document
* @return Version information from the ontology
* @return header information from the ontology
*/
public OntologyHeader fetch(final URL url, final int maxBytes) {
final RequestConfig cfg = RequestConfig.custom().setConnectTimeout(Timeout.ofMinutes(1)).build();
Expand All @@ -56,13 +56,13 @@ public OntologyHeader fetch(final URL url, final int maxBytes) {
log.info("- range request (second part)");
final String s2 = getRange(httpClient, url, maxBytes, false);
log.info("- done, extracting");
return extractVersion(s1 + s2);
return extract(s1 + s2);
} else {
log.info("- not supporting range request, fetching the whole ontology.");
HttpGet request1 = new HttpGet(url.toString());
final String s1 = extractContentFromResponse(httpClient.execute(request1), maxBytes);
log.info("- done, extracting");
return extractVersion(s1);
return extract(s1);
}
} catch (Exception e) {
log.info("An error occurred during fetching ontology from URL " + url, e);
Expand Down Expand Up @@ -90,11 +90,12 @@ private String extractContentFromResponse(final CloseableHttpResponse response,
}
}

private OntologyHeader extractVersion(final String content) {
final OntologyHeader version = new OntologyHeader();
for (final Extractor e : extractors) {
if (e.extract(content, version)) {
return version;
private OntologyHeader extract(final String content) {
final Extractor e = new Extractor();
for (final OntologyHeaderExtractor ohe : ONTOLOGY_HEADER_EXTRACTORS) {
final OntologyHeader header = e.extract(content, ohe);
if (header != null) {
return header;
}
}
return null;
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/cz/sio2/obo/commands/Extract.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package cz.sio2.obo.commands;

import cz.sio2.obo.OBOFoundryVersionExtractor;
import cz.sio2.obo.OBOFoundryHeaderExtractor;
import lombok.extern.slf4j.Slf4j;
import picocli.CommandLine;

Expand All @@ -27,7 +27,7 @@ class Extract implements Callable<Integer> {
@Override
public Integer call() {
try {
new OBOFoundryVersionExtractor().extract(iri, outputFile, headerSize);
new OBOFoundryHeaderExtractor().extract(iri, outputFile, headerSize);
} catch (Exception e) {
log.error("Error during extraction: ", e);
return -1;
Expand Down
6 changes: 3 additions & 3 deletions src/main/java/cz/sio2/obo/commands/Transform.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package cz.sio2.obo.commands;

import cz.sio2.obo.OBOFoundryVersionExtractor;
import cz.sio2.obo.OBOFoundryHeaderExtractor;
import lombok.extern.slf4j.Slf4j;
import picocli.CommandLine;

Expand All @@ -25,9 +25,9 @@ class Transform implements Callable<Integer> {
public Integer call() {
try {
if (outputFile.endsWith("html")) {
new OBOFoundryVersionExtractor().transformToHtml(inputFile, outputFile);
new OBOFoundryHeaderExtractor().transformToHtml(inputFile, outputFile);
} else {
new OBOFoundryVersionExtractor().transformToCsv(inputFile, outputFile);
new OBOFoundryHeaderExtractor().transformToCsv(inputFile, outputFile);
}
} catch (Exception e) {
log.error("Error during extraction: ", e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,25 @@

import java.util.regex.Pattern;

/**
 * Supplies the patterns for ontology text that contains {@code Ontology(}.
 * NOTE(review): "FS" presumably stands for OWL functional syntax - confirm.
 * (Diff view: each removed declaration is immediately followed by the one that replaces it.)
 *
 * The patterns are applied by cz.sio2.obo.Extractor with a whole-input match and capture group 1
 * is used as the result. Every getter compiles its pattern anew on each call.
 * NOTE(review): the patterns are constant, so they could be cached in static final fields.
 */
public class FSExtractor extends Extractor {
public class FSOntologyHeaderExtractor implements OntologyHeaderExtractor {

/**
 * @return pattern accepting any single-line text that contains {@code Ontology(}
 */
@Override
protected Pattern getFormatMatcher() {
public Pattern getFormatMatcher() {
return Pattern.compile(".*Ontology\\(.*");
}

/**
 * @return pattern capturing the first angle-bracketed token after {@code Ontology(}; it matches
 *         only when that token is followed by a space and a second angle-bracketed token, so a
 *         header without a version IRI yields no ontology IRI.
 *         NOTE(review): the capture group is greedy and may span several tokens when the text
 *         contains further bracketed tokens separated by a space - verify on real inputs.
 */
@Override
protected Pattern getIriMatcher() {
public Pattern getIriMatcher() {
return Pattern.compile(".*Ontology\\(<(.+)> <.+?>.*");
}

/**
 * @return pattern capturing an angle-bracketed token that follows another angle-bracketed token
 *         and a space after {@code Ontology(}.
 *         NOTE(review): the first token is matched greedily, so the captured token is not
 *         necessarily the second one - verify on real inputs.
 */
@Override
protected Pattern getVersionIriMatcher() {
public Pattern getVersionIriMatcher() {
return Pattern.compile(".*Ontology\\(<.+> <(.+?)>.*");
}

/**
 * @return pattern capturing the text between {@code versionInfo} and a trailing literal
 *         {@code xxx} at the very end of the input.
 *         NOTE(review): the {@code xxx} suffix looks like a placeholder; as written the pattern
 *         can only match input that ends with those three characters - confirm intent.
 */
@Override
protected Pattern getVersionInfoMatcher() {
public Pattern getVersionInfoMatcher() {
return Pattern.compile(".*versionInfo(.+)xxx");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,25 @@

import java.util.regex.Pattern;

/**
 * Supplies the patterns for ontology text that contains an {@code <rdf:RDF} element.
 * (Diff view: each removed declaration is immediately followed by the one that replaces it.)
 *
 * The patterns are applied by cz.sio2.obo.Extractor with a whole-input match and capture group 1
 * is used as the result. Every getter compiles its pattern anew on each call.
 * NOTE(review): the patterns are constant, so they could be cached in static final fields.
 */
public class RDFXMLExtractor extends Extractor {
public class RDFXMLOntologyHeaderExtractor implements OntologyHeaderExtractor {

/**
 * @return pattern accepting any single-line text that contains {@code <rdf:RDF}; the
 *         {@code rdf} prefix is fixed
 */
@Override
protected Pattern getFormatMatcher() {
public Pattern getFormatMatcher() {
return Pattern.compile(".*<rdf:RDF.*");
}

/**
 * @return pattern capturing the value of the {@code about} attribute of an {@code Ontology}
 *         element; element and attribute may carry an alphanumeric prefix. The attribute must
 *         follow the element name after exactly one space and be closed directly by the end of
 *         the start tag, i.e. it has to be the only attribute.
 */
@Override
protected Pattern getIriMatcher() {
public Pattern getIriMatcher() {
return Pattern.compile(".*<[a-zA-Z0-9]*:?Ontology [a-zA-Z0-9]*:?about=\"([^\"]+?)\">.*");
}

/**
 * @return pattern capturing the value of the {@code resource} attribute of a self-closing
 *         {@code versionIRI} element (optional alphanumeric prefixes, single space between
 *         element name and attribute, no whitespace before the closing slash)
 */
@Override
protected Pattern getVersionIriMatcher() {
public Pattern getVersionIriMatcher() {
return Pattern.compile(".*<[a-zA-Z0-9]*:?versionIRI [a-zA-Z0-9]*:?resource=\"([^\"]+?)\"/>.*");
}

/**
 * @return pattern capturing the non-empty text content of a {@code versionInfo} element
 *         (optional alphanumeric prefix, any attributes on the start tag)
 */
@Override
protected Pattern getVersionInfoMatcher() {
public Pattern getVersionInfoMatcher() {
return Pattern.compile(".*<[a-zA-Z0-9]*:?versionInfo[^>]*>(.+?)</[a-zA-Z0-9]*:?versionInfo>.*");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,25 @@

import java.util.regex.Pattern;

/**
 * Supplies the patterns for ontology text that contains an unprefixed {@code <Ontology} element.
 * NOTE(review): presumably the OWL/XML serialization - confirm.
 * (Diff view: each removed declaration is immediately followed by the one that replaces it.)
 *
 * The patterns are applied by cz.sio2.obo.Extractor with a whole-input match and capture group 1
 * is used as the result. Every getter compiles its pattern anew on each call.
 * NOTE(review): the patterns are constant, so they could be cached in static final fields.
 */
public class XMLExtractor extends Extractor {
public class XMLOntologyHeaderExtractor implements OntologyHeaderExtractor {

/**
 * @return pattern accepting any single-line text that contains {@code <Ontology}
 */
@Override
protected Pattern getFormatMatcher() {
public Pattern getFormatMatcher() {
return Pattern.compile(".*<Ontology.*");
}

/**
 * @return pattern capturing the value of an {@code ontologyIRI} attribute that appears after
 *         {@code <Ontology}; at least two whitespace positions must separate the element name
 *         from the attribute name.
 *         NOTE(review): the gap before the attribute is matched with an unrestricted wildcard,
 *         unlike the version IRI pattern which stays inside the start tag - confirm this
 *         difference is intended.
 */
@Override
protected Pattern getIriMatcher() {
public Pattern getIriMatcher() {
return Pattern.compile(".*<Ontology .* ontologyIRI=\"([^\"]+?)\".*");
}

/**
 * @return pattern capturing the value of a {@code versionIRI} attribute located inside the
 *         {@code <Ontology} start tag
 */
@Override
protected Pattern getVersionIriMatcher() {
public Pattern getVersionIriMatcher() {
return Pattern.compile(".*<Ontology [^>]* versionIRI=\"([^\"]+?)\".*");
}

/**
 * @return pattern capturing the text of a {@code Literal} element that directly follows an
 *         {@code AnnotationProperty} element with {@code abbreviatedIRI="owl:versionInfo"}
 *         inside an {@code Annotation} element; the elements must be separated by at least one
 *         whitespace character
 */
@Override
protected Pattern getVersionInfoMatcher() {
public Pattern getVersionInfoMatcher() {
return Pattern.compile(".*<Annotation>\\s+<AnnotationProperty\\s+abbreviatedIRI=\"owl:versionInfo\"/>\\s+<Literal[^>]*>([^<]+?)</Literal>.*");
}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package cz.sio2.obo.extractor;

import cz.sio2.obo.Extractor;
import cz.sio2.obo.OntologyHeader;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.params.ParameterizedTest;
Expand All @@ -11,7 +12,7 @@
import java.nio.file.Paths;
import java.util.Objects;

public class TestOWLExtractor {
public class TestOWLOntologyHeaderExtractor {

@ParameterizedTest
@CsvFileSource(resources = "/owl-testcases.csv", numLinesToSkip = 1, delimiter = ',')
Expand All @@ -21,10 +22,9 @@ public void testExtractVersionIriFromXMLExtractsVersionInfoCorrectlyIfPresent(
String versionIri,
String versionInfo) throws URISyntaxException, IOException {
final String s = Files.readString(Paths.get(Objects.requireNonNull(getClass().getResource("/owl-testcases/" + file)).toURI()));
final OntologyHeader version = new OntologyHeader();
new FSExtractor().extract(s, version);
Assertions.assertEquals(ontologyIri, version.getOwlOntologyIri());
Assertions.assertEquals(versionIri, version.getOwlVersionIri());
Assertions.assertEquals(versionInfo, version.getOwlVersionInfo());
final OntologyHeader header = new Extractor().extract(s, new FSOntologyHeaderExtractor());
Assertions.assertEquals(ontologyIri, header.getOwlOntologyIri());
Assertions.assertEquals(versionIri, header.getOwlVersionIri());
Assertions.assertEquals(versionInfo, header.getOwlVersionInfo());
}
}
Loading

0 comments on commit 98b4b1b

Please sign in to comment.