Skip to content

Commit

Permalink
Update ontology downloading logic
Browse files Browse the repository at this point in the history
  • Loading branch information
haideriqbal committed Nov 11, 2024
1 parent 5e094c6 commit 376e3c5
Show file tree
Hide file tree
Showing 4 changed files with 229 additions and 84 deletions.
Original file line number Diff line number Diff line change
@@ -1,79 +1,136 @@

package uk.ac.ebi.ols4.predownloader;

import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.ArrayList;
import java.util.Set;
import java.util.LinkedHashSet;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;

import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;

public class BulkOntologyDownloader {

static final int NUM_THREADS = 16;

Set<String> urlsToDownload;
Set<String> urlsAlreadyProcessed;
List<Ontology> ontologiesToDownload;
private Set<String> ontologyIdsAlreadyProcessed;
String downloadPath;
boolean loadLocalFiles;
List<String> updatedOntologyIds;
List<String> unchangedOntologyIds;
private Map<String, String> previousChecksums;
private Map<String, String> updatedChecksums;

Set<OntologyDownloaderThread> threads = new HashSet<>();

public BulkOntologyDownloader(List<String> ontologyUrls, String downloadPath, boolean loadLocalFiles) {
this.urlsToDownload = new LinkedHashSet<String>(ontologyUrls);
this.urlsAlreadyProcessed = new HashSet<>();
/**
 * Creates a downloader for the given ontologies.
 *
 * @param ontologies        ontologies to download; copied into this instance's own work list,
 *                          so the caller's list is not modified
 * @param downloadPath      stored unchanged in {@code downloadPath}
 * @param loadLocalFiles    stored unchanged in {@code loadLocalFiles}
 * @param previousChecksums checksums recorded by an earlier run; {@code null} is treated as
 *                          "no previous checksums"
 */
public BulkOntologyDownloader(List<Ontology> ontologies,
        String downloadPath,
        boolean loadLocalFiles,
        Map<String, String> previousChecksums) {
    this.ontologiesToDownload = new ArrayList<>(ontologies);
    this.ontologyIdsAlreadyProcessed = Collections.synchronizedSet(new HashSet<>());
    this.downloadPath = downloadPath;
    this.loadLocalFiles = loadLocalFiles;
    // The caller may hand over the direct result of Gson.fromJson, which is null when the
    // checksum file is empty. Normalize to an empty map so later lookups cannot hit a null map.
    this.previousChecksums = previousChecksums != null ? previousChecksums : new HashMap<>();
    // The result collections below are handed to the downloader threads, hence the
    // concurrent/synchronized implementations.
    this.updatedChecksums = new ConcurrentHashMap<>();
    this.updatedOntologyIds = Collections.synchronizedList(new ArrayList<>());
    this.unchangedOntologyIds = Collections.synchronizedList(new ArrayList<>());
}

public void downloadAll() {

while(urlsToDownload.size() > 0) {
while (!ontologiesToDownload.isEmpty()) {

List<Thread> threads = new ArrayList<>();
Set<String> imports = new LinkedHashSet<>();
Set<Ontology> imports = new LinkedHashSet<>();

for(int i = 0; i < NUM_THREADS; ++ i) {

if(urlsToDownload.size() == 0) {
if (ontologiesToDownload.isEmpty()) {
break;
}

Iterator<String> it = urlsToDownload.iterator();
String nextUrl = it.next();
it.remove();
Ontology ontology = ontologiesToDownload.remove(0);

// Check if we've already processed this ontology ID
if (ontologyIdsAlreadyProcessed.contains(ontology.getId())) {
continue;
}

urlsAlreadyProcessed.add(nextUrl);
ontologyIdsAlreadyProcessed.add(ontology.getId());

OntologyDownloaderThread downloaderThread = new OntologyDownloaderThread(
this,
ontology,
importedOntologies -> {
synchronized (imports) {
imports.addAll(importedOntologies);
}
},
previousChecksums,
updatedChecksums,
updatedOntologyIds,
unchangedOntologyIds
);


Thread thread = new Thread(downloaderThread, "Downloader thread " + i);
threads.add(thread);

thread.start();

for (Thread t : threads) {
try {
t.join();
System.out.println(t.getName() + " finished");
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}

OntologyDownloaderThread downloader =
new OntologyDownloaderThread(this, nextUrl, importUrls -> {
imports.addAll(importUrls);
});
synchronized (ontologiesToDownload) {
for (Ontology importedOntology : imports) {
if (!ontologyIdsAlreadyProcessed.contains(importedOntology.getId())) {
ontologiesToDownload.add(importedOntology);
}
}
}
}

Thread t = new Thread(downloader, "Downloader thread " + i);
threads.add(t);
saveChecksums(updatedChecksums);
}

t.start();
/**
 * Writes the given checksum map as pretty-printed JSON to {@code checksums.json}
 * (a relative path, so it resolves against the process working directory).
 *
 * <p>An {@link IOException} while opening, writing or closing the file is reported on
 * standard error and otherwise ignored.
 *
 * @param checksums the map to serialize
 */
private void saveChecksums(Map<String, String> checksums) {
    GsonBuilder prettyBuilder = new GsonBuilder();
    prettyBuilder.setPrettyPrinting();
    Gson serializer = prettyBuilder.create();

    try (Writer out = new FileWriter("checksums.json")) {
        serializer.toJson(checksums, out);
    } catch (IOException ioe) {
        System.err.println("Error writing checksums.json: " + ioe.getMessage());
    }
}

for(Thread t : threads) {
try {
t.join();
System.out.println(t.getName() + " finished");
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
private void printUpdateSummary() {
System.out.println("\nUpdate Summary:");
System.out.println("Total ontologies processed: " + (updatedOntologyIds.size() + unchangedOntologyIds.size()));
System.out.println("Ontologies updated: " + updatedOntologyIds.size());
System.out.println("Ontologies unchanged: " + unchangedOntologyIds.size());

if (!updatedOntologyIds.isEmpty()) {
System.out.println("\nUpdated Ontologies:");
for (String id : updatedOntologyIds) {
System.out.println(" - " + id);
}
}

for(String importUrl : imports) {
if(!urlsAlreadyProcessed.contains(importUrl)) {
urlsAlreadyProcessed.add(importUrl);
urlsToDownload.add(importUrl);
if (!unchangedOntologyIds.isEmpty()) {
System.out.println("\nUnchanged Ontologies:");
for (String id : unchangedOntologyIds) {
System.out.println(" - " + id);
}
}
}

}

}
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package uk.ac.ebi.ols4.predownloader;

import com.google.common.reflect.TypeToken;
import com.google.gson.Gson;
import com.google.gson.stream.JsonReader;
import com.google.gson.stream.JsonToken;
Expand All @@ -8,16 +9,9 @@
import org.apache.commons.cli.*;

import java.io.*;
import java.lang.reflect.Type;
import java.net.URL;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.*;
import java.util.stream.Collectors;

public class Downloader {
Expand Down Expand Up @@ -83,6 +77,18 @@ public static void main(String[] args) throws IOException {

}).collect(Collectors.toList());

Map<String, String> previousChecksums = new HashMap<>();
File checksumFile = new File("checksums.json");
if (checksumFile.exists()) {
try (Reader reader = new FileReader(checksumFile)) {
Type type = new TypeToken<Map<String, String>>() {}.getType();
previousChecksums = gson.fromJson(reader, type);
} catch (IOException e) {
System.err.println("Error reading checksums.json: " + e.getMessage());
}
}



LinkedHashMap<String, Map<String,Object>> mergedConfigs = new LinkedHashMap<>();

Expand All @@ -108,9 +114,11 @@ public static void main(String[] args) throws IOException {


Set<String> ontologyUrls = new LinkedHashSet<>();
List<Ontology> ontologyList = new ArrayList<>();

for(Map<String,Object> config : mergedConfigs.values()) {

String ontologyId = ((String) config.get("id")).toLowerCase();
String url = (String) config.get("ontology_purl");

if(url == null) {
Expand All @@ -132,12 +140,13 @@ public static void main(String[] args) throws IOException {
}
}

if(url != null)
ontologyUrls.add(url);
if (url != null) {
ontologyList.add(new Ontology(ontologyId, url));
}
}


BulkOntologyDownloader downloader = new BulkOntologyDownloader(List.copyOf(ontologyUrls), downloadPath, bLoadLocalFiles);
BulkOntologyDownloader downloader = new BulkOntologyDownloader(ontologyList, downloadPath, bLoadLocalFiles, previousChecksums);

downloader.downloadAll();

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package uk.ac.ebi.ols4.predownloader;

public class Ontology {
private String id;
private String url;

public Ontology(String id, String url) {
this.id = id;
this.url = url;
}

public String getId() {
return id;
}

public String getUrl() {
return url;
}
}

Loading

0 comments on commit 376e3c5

Please sign in to comment.