From ce1767a53a30eb019f4cf77e1680c0014cf1429b Mon Sep 17 00:00:00 2001 From: imedina Date: Tue, 2 Jan 2024 02:33:13 +0000 Subject: [PATCH] downloader: add cancer hotspot --- .../cellbase/core/config/DownloadProperties.java | 10 ++++++++++ .../src/main/resources/configuration.yml | 4 ++++ .../lib/download/GeneDownloadManager.java | 15 ++++++++++++++- 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java index ee4216f56..a897625ef 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java @@ -26,6 +26,7 @@ public class DownloadProperties { private EnsemblProperties ensembl; private EnsemblProperties ensemblGenomes; private URLProperties hgnc; + private URLProperties cancerHotspot; private URLProperties refSeq; private URLProperties refSeqFasta; private URLProperties refSeqProteinFasta; @@ -517,6 +518,15 @@ public DownloadProperties setHgnc(URLProperties hgnc) { return this; } + public URLProperties getCancerHotspot() { + return cancerHotspot; + } + + public DownloadProperties setCancerHotspot(URLProperties cancerHotspot) { + this.cancerHotspot = cancerHotspot; + return this; + } + public static class EnsemblProperties { private DatabaseCredentials database; diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 172f950f5..7a5b25ea6 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -64,6 +64,9 @@ download: hgnc: host: https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2023-11-01.txt version: 2023-11-01 + cancerHotspot: + host: https://www.cancerhotspots.org/files/hotspots_v2.xls + version: 
"v2" refSeq: host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz refSeqFasta: @@ -161,6 +164,7 @@ download: host: https://old.dgidb.org/data/monthly_tsvs/2022-Feb/interactions.tsv version: "2022-02-01" cadd: + ## Nacho: Move to https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz ASAP! host: https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz reactome: host: http://www.reactome.org/download/current/biopax.zip diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 9bd82a951..260ff7542 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -83,6 +83,7 @@ public List download() throws IOException, InterruptedException { downloadFiles.add(downloadMane(geneFolder)); downloadFiles.add(downloadLrg(geneFolder)); downloadFiles.add(downloadHgnc(geneFolder)); + downloadFiles.add(downloadCancerHotspot(geneFolder)); downloadFiles.add(downloadDrugData(geneFolder)); downloadFiles.addAll(downloadGeneUniprotXref(geneFolder)); downloadFiles.add(downloadGeneExpressionAtlas(geneFolder)); @@ -211,7 +212,7 @@ private DownloadFile downloadLrg(Path geneFolder) throws IOException, Interrupte private DownloadFile downloadHgnc(Path geneFolder) throws IOException, InterruptedException { if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { - logger.info("Downloading LRG ..."); + logger.info("Downloading HGNC ..."); String url = configuration.getDownload().getHgnc().getHost(); saveVersionData(EtlCommons.GENE_DATA, "HGNC_GENE", configuration.getDownload().getHgnc().getVersion(), getTimeStamp(), Collections.singletonList(url), geneFolder.resolve("hgncVersion.json")); @@ 
-221,6 +222,27 @@ private DownloadFile downloadHgnc(Path geneFolder) throws IOException, Interrupt
         return null;
     }
 
+    /**
+     * Downloads the Cancer Hotspot file and records its version metadata. Only runs when the species is "Homo sapiens".
+     *
+     * @param geneFolder folder against which the version file (cancerHotspotVersion.json) and the download target are resolved
+     * @return the value returned by downloadFile for the configured URL, or null for any other species
+     * @throws IOException presumably raised by saveVersionData or downloadFile; declared to match the sibling download methods
+     * @throws InterruptedException presumably raised by downloadFile; declared to match the sibling download methods
+     */
+    private DownloadFile downloadCancerHotspot(Path geneFolder) throws IOException, InterruptedException {
+        if (speciesConfiguration.getScientificName().equals("Homo sapiens")) {
+            logger.info("Downloading Cancer Hotspot ...");
+            String url = configuration.getDownload().getCancerHotspot().getHost();
+            // The version must come from the cancerHotspot entry so that it matches the URL recorded next to it
+            saveVersionData(EtlCommons.GENE_DATA, "CANCER_HOTSPOT", configuration.getDownload().getCancerHotspot().getVersion(),
+                    getTimeStamp(), Collections.singletonList(url), geneFolder.resolve("cancerHotspotVersion.json"));
+            // The local file name is the last path segment of the URL
+            String[] array = url.split("/");
+            return downloadFile(url, geneFolder.resolve(array[array.length - 1]).toString());
+        }
+        return null;
+    }
+
     private DownloadFile downloadGO(Path geneFolder) throws IOException, InterruptedException {
         if (speciesConfiguration.getScientificName().equals("Homo sapiens")) {
             logger.info("Downloading go annotation...");