diff --git a/dataload/predownloader/src/main/java/uk/ac/ebi/ols4/predownloader/BulkOntologyDownloader.java b/dataload/predownloader/src/main/java/uk/ac/ebi/ols4/predownloader/BulkOntologyDownloader.java
index 1ec6bc821..c53d8f4f3 100644
--- a/dataload/predownloader/src/main/java/uk/ac/ebi/ols4/predownloader/BulkOntologyDownloader.java
+++ b/dataload/predownloader/src/main/java/uk/ac/ebi/ols4/predownloader/BulkOntologyDownloader.java
@@ -1,79 +1,136 @@
 package uk.ac.ebi.ols4.predownloader;
 
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.ArrayList;
-import java.util.Set;
-import java.util.LinkedHashSet;
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.Writer;
+import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
 
 public class BulkOntologyDownloader {
 
     static final int NUM_THREADS = 16;
 
-    Set<String> urlsToDownload;
-    Set<String> urlsAlreadyProcessed;
+    List<Ontology> ontologiesToDownload;
+    private Set<String> ontologyIdsAlreadyProcessed;
     String downloadPath;
     boolean loadLocalFiles;
+    List<String> updatedOntologyIds;
+    List<String> unchangedOntologyIds;
+    private Map<String, String> previousChecksums;
+    private Map<String, String> updatedChecksums;
 
     Set<Thread> threads = new HashSet<>();
 
-    public BulkOntologyDownloader(List<String> ontologyUrls, String downloadPath, boolean loadLocalFiles) {
-        this.urlsToDownload = new LinkedHashSet<String>(ontologyUrls);
-        this.urlsAlreadyProcessed = new HashSet<>();
+    public BulkOntologyDownloader(List<Ontology> ontologies,
+                                  String downloadPath,
+                                  boolean loadLocalFiles,
+                                  Map<String, String> previousChecksums) {
+        this.ontologiesToDownload = new ArrayList<>(ontologies);
+        this.ontologyIdsAlreadyProcessed = Collections.synchronizedSet(new HashSet<>());
         this.downloadPath = downloadPath;
         this.loadLocalFiles = loadLocalFiles;
+        this.previousChecksums = previousChecksums;
+        this.updatedChecksums = new ConcurrentHashMap<>();
+        this.updatedOntologyIds = Collections.synchronizedList(new ArrayList<>());
+        this.unchangedOntologyIds = Collections.synchronizedList(new ArrayList<>());
    }
 
     public void downloadAll() {
 
-        while(urlsToDownload.size() > 0) {
+        while (!ontologiesToDownload.isEmpty()) {
 
             List<Thread> threads = new ArrayList<>();
-            Set<String> imports = new LinkedHashSet<>();
+            Set<Ontology> imports = new LinkedHashSet<>();
 
             for(int i = 0; i < NUM_THREADS; ++ i) {
 
-                if(urlsToDownload.size() == 0) {
+                if (ontologiesToDownload.isEmpty()) {
                     break;
                 }
 
-                Iterator<String> it = urlsToDownload.iterator();
-                String nextUrl = it.next();
-                it.remove();
+                Ontology ontology = ontologiesToDownload.remove(0);
+
+                // Check if we've already processed this ontology ID
+                if (ontologyIdsAlreadyProcessed.contains(ontology.getId())) {
+                    continue;
+                }
 
-                urlsAlreadyProcessed.add(nextUrl);
+                ontologyIdsAlreadyProcessed.add(ontology.getId());
+
+                OntologyDownloaderThread downloaderThread = new OntologyDownloaderThread(
+                        this,
+                        ontology,
+                        importedOntologies -> {
+                            synchronized (imports) {
+                                imports.addAll(importedOntologies);
+                            }
+                        },
+                        previousChecksums,
+                        updatedChecksums,
+                        updatedOntologyIds,
+                        unchangedOntologyIds
+                );
+
+
+                Thread thread = new Thread(downloaderThread, "Downloader thread " + i);
+                threads.add(thread);
+
+                thread.start();
+
+                for (Thread t : threads) {
+                    try {
+                        t.join();
+                        System.out.println(t.getName() + " finished");
+                    } catch (InterruptedException e) {
+                        e.printStackTrace();
+                    }
+                }
+
             }
 
-                OntologyDownloaderThread downloader =
-                        new OntologyDownloaderThread(this, nextUrl, importUrls -> {
-                            imports.addAll(importUrls);
-                        });
+            synchronized (ontologiesToDownload) {
+                for (Ontology importedOntology : imports) {
+                    if (!ontologyIdsAlreadyProcessed.contains(importedOntology.getId())) {
+                        ontologiesToDownload.add(importedOntology);
+                    }
+                }
+            }
+        }
 
-                Thread t = new Thread(downloader, "Downloader thread " + i);
-                threads.add(t);
+        saveChecksums(updatedChecksums);
+    }
 
-                t.start();
+    private void saveChecksums(Map<String, String> checksums) {
+        try (Writer writer = new FileWriter("checksums.json")) {
+            Gson gson = new GsonBuilder().setPrettyPrinting().create();
+            gson.toJson(checksums, writer);
+        } catch (IOException e) {
+            System.err.println("Error writing checksums.json: " + e.getMessage());
         }
+    }
 
-            for(Thread t : threads) {
-                try {
-                    t.join();
-                    System.out.println(t.getName() + " finished");
-                } catch (InterruptedException e) {
-                    // TODO Auto-generated catch block
-                    e.printStackTrace();
+    private void printUpdateSummary() {
+        System.out.println("\nUpdate Summary:");
+        System.out.println("Total ontologies processed: " + (updatedOntologyIds.size() + unchangedOntologyIds.size()));
+        System.out.println("Ontologies updated: " + updatedOntologyIds.size());
+        System.out.println("Ontologies unchanged: " + unchangedOntologyIds.size());
+
+        if (!updatedOntologyIds.isEmpty()) {
+            System.out.println("\nUpdated Ontologies:");
+            for (String id : updatedOntologyIds) {
+                System.out.println("  - " + id);
             }
         }
 
-            for(String importUrl : imports) {
-                if(!urlsAlreadyProcessed.contains(importUrl)) {
-                    urlsAlreadyProcessed.add(importUrl);
-                    urlsToDownload.add(importUrl);
+        if (!unchangedOntologyIds.isEmpty()) {
+            System.out.println("\nUnchanged Ontologies:");
+            for (String id : unchangedOntologyIds) {
+                System.out.println("  - " + id);
             }
         }
     }
-    }
-
 }
diff --git a/dataload/predownloader/src/main/java/uk/ac/ebi/ols4/predownloader/Downloader.java b/dataload/predownloader/src/main/java/uk/ac/ebi/ols4/predownloader/Downloader.java
index dcff958b6..1523edd90 100644
--- a/dataload/predownloader/src/main/java/uk/ac/ebi/ols4/predownloader/Downloader.java
+++ b/dataload/predownloader/src/main/java/uk/ac/ebi/ols4/predownloader/Downloader.java
@@ -1,5 +1,6 @@
 package uk.ac.ebi.ols4.predownloader;
 
+import com.google.common.reflect.TypeToken;
 import com.google.gson.Gson;
 import com.google.gson.stream.JsonReader;
 import com.google.gson.stream.JsonToken;
@@ -8,16 +9,9 @@ import org.apache.commons.cli.*;
 
 import java.io.*;
+import java.lang.reflect.Type;
 import java.net.URL;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.LinkedHashMap;
-import java.util.LinkedHashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeMap;
+import java.util.*;
 import java.util.stream.Collectors;
 
 public class Downloader {
 
@@ -83,6 +77,18 @@ public static void main(String[] args) throws IOException {
 
         }).collect(Collectors.toList());
 
+        Map<String, String> previousChecksums = new HashMap<>();
+        File checksumFile = new File("checksums.json");
+        if (checksumFile.exists()) {
+            try (Reader reader = new FileReader(checksumFile)) {
+                Type type = new TypeToken<Map<String, String>>() {}.getType();
+                previousChecksums = gson.fromJson(reader, type);
+            } catch (IOException e) {
+                System.err.println("Error reading checksums.json: " + e.getMessage());
+            }
+        }
+
+
         LinkedHashMap<String, Map<String,Object>> mergedConfigs = new LinkedHashMap<>();
 
@@ -108,9 +114,11 @@ public static void main(String[] args) throws IOException {
 
         Set<String> ontologyUrls = new LinkedHashSet<>();
+        List<Ontology> ontologyList = new ArrayList<>();
 
         for(Map<String,Object> config : mergedConfigs.values()) {
 
+            String ontologyId = ((String) config.get("id")).toLowerCase();
             String url = (String) config.get("ontology_purl");
 
             if(url == null) {
@@ -132,12 +140,13 @@ public static void main(String[] args) throws IOException {
                 }
             }
 
-            if(url != null)
-                ontologyUrls.add(url);
+            if (url != null) {
+                ontologyList.add(new Ontology(ontologyId, url));
+            }
         }
 
-        BulkOntologyDownloader downloader = new BulkOntologyDownloader(List.copyOf(ontologyUrls), downloadPath, bLoadLocalFiles);
+        BulkOntologyDownloader downloader = new BulkOntologyDownloader(ontologyList, downloadPath, bLoadLocalFiles, previousChecksums);
 
         downloader.downloadAll();
 
diff --git a/dataload/predownloader/src/main/java/uk/ac/ebi/ols4/predownloader/Ontology.java b/dataload/predownloader/src/main/java/uk/ac/ebi/ols4/predownloader/Ontology.java
new file mode 100644
index 000000000..eecbdbc8e
--- /dev/null
+++ b/dataload/predownloader/src/main/java/uk/ac/ebi/ols4/predownloader/Ontology.java
@@ -0,0 +1,20 @@
+package uk.ac.ebi.ols4.predownloader;
+
+public class Ontology {
+    private String id;
+    private String url;
+
+    public Ontology(String id, String url) {
+        this.id = id;
+        this.url = url;
+    }
+
+    public String getId() {
+        return id;
+    }
+
+    public String getUrl() {
+        return url;
+    }
+}
+
diff --git a/dataload/predownloader/src/main/java/uk/ac/ebi/ols4/predownloader/OntologyDownloaderThread.java b/dataload/predownloader/src/main/java/uk/ac/ebi/ols4/predownloader/OntologyDownloaderThread.java
index f8a01b28f..616a9d606 100644
--- a/dataload/predownloader/src/main/java/uk/ac/ebi/ols4/predownloader/OntologyDownloaderThread.java
+++ b/dataload/predownloader/src/main/java/uk/ac/ebi/ols4/predownloader/OntologyDownloaderThread.java
@@ -14,11 +14,9 @@ import java.nio.file.OpenOption;
 import java.nio.file.Paths;
 import java.nio.file.StandardOpenOption;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.LinkedHashSet;
-import java.util.List;
-import java.util.Set;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.*;
 import java.util.function.Consumer;
 import java.util.function.Function;
 
@@ -43,64 +41,79 @@ public class OntologyDownloaderThread implements Runnable {
 
     BulkOntologyDownloader downloader;
-    String ontologyUrl;
-    Consumer<Collection<String>> consumeImports;
-
-    public OntologyDownloaderThread(BulkOntologyDownloader downloader, String ontologyUrl, Consumer<Collection<String>> consumeImports) {
+    Ontology ontology;
+    Consumer<Collection<Ontology>> consumeImports;
+    Map<String, String> previousChecksums;
+    Map<String, String> updatedChecksums;
+    List<String> updatedOntologyIds;
+    List<String> unchangedOntologyIds;
+
+    public OntologyDownloaderThread(BulkOntologyDownloader downloader,
+                                    Ontology ontology,
+                                    Consumer<Collection<Ontology>> consumeImports,
+                                    Map<String, String> previousChecksums,
+                                    Map<String, String> updatedChecksums,
+                                    List<String> updatedOntologyIds,
+                                    List<String> unchangedOntologyIds) {
         super();
         this.downloader = downloader;
-        this.ontologyUrl = ontologyUrl;
+        this.ontology = ontology;
         this.consumeImports = consumeImports;
+        this.previousChecksums = previousChecksums;
+        this.updatedChecksums = updatedChecksums;
+        this.updatedOntologyIds = updatedOntologyIds;
+        this.unchangedOntologyIds = unchangedOntologyIds;
     }
 
     @Override
     public void run() {
-
+        String ontologyId = ontology.getId();
+        String ontologyUrl = ontology.getUrl();
 
         String path = downloader.downloadPath + "/" + urlToFilename(ontologyUrl);
 
         System.out.println(Thread.currentThread().getName() + " Starting download for " + ontologyUrl + " to " + path);
 
-        Set<String> importUrls = new LinkedHashSet<>();
-
         long begin = System.nanoTime();
 
         try {
 
             String mimetype = downloadURL(ontologyUrl, path);
 
-            Lang lang = RDFLanguages.contentTypeToLang(mimetype);
-            if(lang == null) {
-                lang = Lang.RDFXML;
-            }
+            String newChecksum = computeMD5Checksum(new File(path));
 
-            // parse to look for imports only
-            createParser(lang).source(new FileInputStream(path)).parse(new StreamRDF() {
-                public void start() {}
-                public void quad(Quad quad) {}
-                public void base(String base) {}
-                public void prefix(String prefix, String iri) {}
-                public void finish() {}
-                public void triple(Triple triple) {
-
-                    if (triple.getPredicate().getURI().equals("http://www.w3.org/2002/07/owl#imports")) {
-                        importUrls.add(triple.getObject().getURI());
-                    }
-                }
-            });
+            String previousChecksum = previousChecksums.get(ontologyUrl);
 
-        } catch (Exception e) {
+            if (previousChecksum == null || !newChecksum.equals(previousChecksum)) {
+                // Ontology is new or has changed; process it
+                System.out.println("Processing updated ontology: " + ontologyUrl);
+
+                // Parse ontology for imports
+                Set<Ontology> importOntologies = parseOntologyForImports(path, mimetype);
+                // Update the checksum map (synchronized for thread safety)
+                updatedChecksums.put(ontologyUrl, newChecksum);
+
+                updatedOntologyIds.add(ontologyId);
+
+                // Pass import URLs to the parent downloader
+                consumeImports.accept(importOntologies);
+
+            } else {
+                // Ontology hasn't changed; skip processing
+                System.out.println("Skipping unchanged ontology: " + ontologyUrl);
+                unchangedOntologyIds.add(ontologyId);
+            }
+
+        } catch (Exception e) {
             e.printStackTrace();
         }
 
         long end = System.nanoTime();
 
         System.out.println(Thread.currentThread().getName() + " Downloading and parsing for imports " + ontologyUrl + " took " + ((end-begin) / 1000 / 1000 / 1000) + "s");
-
-        consumeImports.accept(importUrls);
     }
 
     private String urlToFilename(String url) {
@@ -138,5 +151,51 @@ private static String downloadURL(String url, String filename) throws FileNotFou
         }
     }
 
+    private String computeMD5Checksum(File file) throws NoSuchAlgorithmException {
+        try (InputStream fis = new FileInputStream(file)) {
+            MessageDigest md = MessageDigest.getInstance("MD5");
+            byte[] buffer = new byte[1024];
+            int bytesRead;
+            while ((bytesRead = fis.read(buffer)) != -1) {
+                md.update(buffer, 0, bytesRead);
+            }
+            byte[] digest = md.digest();
+            // Convert the byte array to a hexadecimal string
+            StringBuilder sb = new StringBuilder();
+            for (byte b : digest) {
+                sb.append(String.format("%02x", b & 0xff));
+            }
+            return sb.toString();
+        } catch (IOException | NoSuchAlgorithmException e) {
+            System.err.println("Error computing checksum for file " + file.getName() + ": " + e.getMessage());
+            return null;
+        }
+    }
+
+    private Set<Ontology> parseOntologyForImports(String path, String mimetype) throws FileNotFoundException {
+        Set<Ontology> importOntologies = new LinkedHashSet<>();
+
+        Lang lang = RDFLanguages.contentTypeToLang(mimetype);
+        if (lang == null) {
+            lang = Lang.RDFXML;
+        }
+
+        createParser(lang).source(new FileInputStream(path)).parse(new StreamRDF() {
+            public void start() {}
+            public void quad(Quad quad) {}
+            public void base(String base) {}
+            public void prefix(String prefix, String iri) {}
+            public void finish() {}
+            public void triple(Triple triple) {
+                if (triple.getPredicate().getURI().equals("http://www.w3.org/2002/07/owl#imports")) {
+                    String importUrl = triple.getObject().getURI();
+                    importOntologies.add(new Ontology(importUrl, importUrl));
+                }
+            }
+        });
+
+        return importOntologies;
+    }
+
 }