Skip to content

Commit

Permalink
add support for gzipped ontologies (#761)
Browse files Browse the repository at this point in the history
  • Loading branch information
jamesamcl committed Oct 13, 2024
1 parent 4f3b427 commit a754d90
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 39 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import java.util.Set;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.zip.GZIPInputStream;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.input.TeeInputStream;
Expand Down Expand Up @@ -72,12 +73,21 @@ public void run() {
String mimetype = downloadURL(ontologyUrl, path);

Lang lang = RDFLanguages.contentTypeToLang(mimetype);
if(lang == null) {
lang = RDFLanguages.filenameToLang(ontologyUrl, Lang.RDFXML);
}
if(lang == null) {
lang = Lang.RDFXML;
}

InputStream is = new FileInputStream(path);

if(ontologyUrl.endsWith(".gz")) {
is = new GZIPInputStream(is);
}

// parse to look for imports only
createParser(lang).source(new FileInputStream(path)).parse(new StreamRDF() {
createParser(lang).source(is).parse(new StreamRDF() {
public void start() {}
public void quad(Quad quad) {}
public void base(String base) {}
Expand Down
112 changes: 74 additions & 38 deletions dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/OntologyGraph.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,13 @@
import uk.ac.ebi.rdf2json.properties.*;

import org.apache.jena.riot.Lang;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.jena.graph.Node;
import org.apache.jena.graph.Triple;
import org.apache.jena.riot.RDFParser;
Expand All @@ -16,13 +23,19 @@
import org.apache.jena.sparql.core.Quad;

import java.io.IOException;
import java.io.InputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.*;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;

import static uk.ac.ebi.rdf2json.OntologyNode.NodeType.*;
import static uk.ac.ebi.ols.shared.DefinedFields.*;

Expand Down Expand Up @@ -56,45 +69,68 @@ private RDFParserBuilder createParser(Lang lang) {
}
}

private void parseRDF(String url) {
private void parseRDF(String url, InputStream is, String contentType) throws IOException {

try {
if (loadLocalFiles && !url.contains("://")) {
logger.debug("Using local file for {}", url);
sourceFileTimestamp = new File(url).lastModified();
createParser(RDFLanguages.filenameToLang(url, Lang.RDFXML))
.source(new FileInputStream(url)).parse(this);
} else {
if (downloadedPath != null) {
String existingDownload = downloadedPath + "/" + urlToFilename(url);
try {
FileInputStream is = new FileInputStream(existingDownload);
logger.debug("Using predownloaded file for {}", url);
sourceFileTimestamp = new File(existingDownload).lastModified();
Lang lang = null;
try {
String existingDownloadMimeType = Files.readString(Paths.get(existingDownload + ".mimetype"));
lang = RDFLanguages.contentTypeToLang(existingDownloadMimeType);
} catch(IOException ignored) {
}
if(lang == null) {
lang = Lang.RDFXML;
}
createParser(lang).source(is).parse(this);
} catch (Exception e) {
logger.error("Downloading (not predownloaded) {}", url);
sourceFileTimestamp = System.currentTimeMillis();
createParser(null).source(url).parse(this);
}
} else {
logger.debug("Downloading (no predownload path provided) {}", url);
sourceFileTimestamp = System.currentTimeMillis();
createParser(null).source(url).parse(this);
}
}
} catch (FileNotFoundException e) {
throw new RuntimeException(e);
if(url.endsWith(".gz")) {
System.out.println("parseRDF: Decompressing gzipped ontology " + url);
is = new GZIPInputStream(is);
url = url.substring(0, url.length() - 3);
}

Lang lang = null;
if(contentType != null) {
lang = RDFLanguages.contentTypeToLang(contentType);
}
if(lang == null) {
lang = RDFLanguages.filenameToLang(url, Lang.RDFXML);
}
if(lang == null) {
lang = Lang.RDFXML;
}

createParser(lang).source(is).parse(this);
}

private void parseRDF(String url) throws IOException {

if (loadLocalFiles && !url.contains("://")) {
logger.debug("parseRDF: Using local file for {}", url);
sourceFileTimestamp = new File(url).lastModified();
parseRDF(url, new FileInputStream(url), null);
return;
}

if (downloadedPath != null) {
String existingDownload = downloadedPath + "/" + urlToFilename(url);
InputStream is = new FileInputStream(existingDownload);
logger.debug("parseRDF: Using predownloaded file for {}", url);
sourceFileTimestamp = new File(existingDownload).lastModified();
String existingDownloadMimeType = Files.readString(Paths.get(existingDownload + ".mimetype"));
parseRDF(url, is, existingDownloadMimeType);
return;
}

logger.error("parseRDF: Downloading (not predownloaded) {}", url);
sourceFileTimestamp = System.currentTimeMillis();

HttpEntity res = getURL(url);
InputStream is = res.getContent();
String contentType = res.getContentType().getValue();
parseRDF(url, is, contentType);
}

private static HttpEntity getURL(String url) throws FileNotFoundException, IOException {

RequestConfig config = RequestConfig.custom()
.setConnectTimeout(5000)
.setConnectionRequestTimeout(5000)
.setSocketTimeout(5000).build();

CloseableHttpClient client = HttpClientBuilder.create().setDefaultRequestConfig(config).build();

HttpGet request = new HttpGet(url);
HttpResponse response = client.execute(request);
return response.getEntity();
}

private String urlToFilename(String url) {
Expand All @@ -107,7 +143,7 @@ private String urlToFilename(String url) {
String downloadedPath;


OntologyGraph(Map<String, Object> config, boolean loadLocalFiles, boolean noDates, String downloadedPath) {
OntologyGraph(Map<String, Object> config, boolean loadLocalFiles, boolean noDates, String downloadedPath) throws IOException {

this.loadLocalFiles = loadLocalFiles;
this.downloadedPath = downloadedPath;
Expand Down

0 comments on commit a754d90

Please sign in to comment.