Skip to content

Commit

Permalink
Merge branch 'TASK-5564' of https://github.com/opencb/cellbase into T…
Browse files Browse the repository at this point in the history
…ASK-5564
  • Loading branch information
jtarraga committed Jul 8, 2024
2 parents 5eb33ae + 3dcad47 commit f44dcc5
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ public void execute() throws CellBaseException {
case REGULATION_DATA:
downloader = new RegulationDownloadManager(species, assembly, outputDirectory, configuration);
break;
case VARIATION_DATA:
downloader = new VariationDownloadManager(species, assembly, outputDirectory, configuration);
break;
case VARIATION_FUNCTIONAL_SCORE_DATA:
downloader = new CaddDownloadManager(species, assembly, outputDirectory, configuration);
break;
Expand Down
3 changes: 2 additions & 1 deletion cellbase-core/src/main/resources/configuration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,7 @@ species:
- gene
- protein
- regulation
- variation
- variation_functional_score
- missense_variation_functional_score
- clinical_variant
Expand All @@ -355,7 +356,7 @@ species:
- gene
- regulation
- protein
# - variation
- variation
- id: rnorvegicus
scientificName: Rattus norvegicus
assemblies:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,7 @@ public final class EtlCommons {
dataNamesMap.put(SPLICE_SCORE_DATA, "Splice Score");
dataNamesMap.put(MMSPLICE_DATA, "MMSplice");
dataNamesMap.put(SPLICEAI_DATA, "SpliceAI");
dataNamesMap.put(VARIATION_DATA, "Variation");


// Populate data categories map
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,9 @@ public List<DownloadFile> download() throws IOException, InterruptedException, C
// Ensembl
downloadFiles.addAll(downloadEnsemblData(ensemblDownloadPath));

// Ensembl canonical
downloadEnsemblCanonical();

// RefSeq
downloadFiles.addAll(downloadRefSeq(refSeqDownloadPath));

Expand All @@ -87,25 +90,24 @@ public List<DownloadFile> download() throws IOException, InterruptedException, C
downloadFiles.add(downloadGO(geneDownloadPath));
logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_ANNOTATION_DATA));

downloadEnsemblCanonical();

// Save data sources manually downloaded
// HPO
saveDataSource(HPO_DISEASE_DATA, configuration.getDownload().getHpo().getVersion(), getTimeStamp(),
Collections.singletonList(getManualUrl(configuration.getDownload().getHpo(), HPO_FILE_ID)),
geneDownloadPath.resolve(getDataVersionFilename(HPO_DISEASE_DATA)));
logger.warn("{} must be downloaded manually; the version file {} was created at {}", getDataName(HPO_DISEASE_DATA),
getDataVersionFilename(HPO_DISEASE_DATA), geneDownloadPath);

// Cancer gene census
saveDataSource(CANCER_GENE_CENSUS_DATA, configuration.getDownload().getCancerGeneCensus().getVersion(), getTimeStamp(),
Collections.singletonList(getManualUrl(configuration.getDownload().getCancerGeneCensus(), CANCER_GENE_CENSUS_FILE_ID)),
geneDownloadPath.resolve(getDataVersionFilename(CANCER_GENE_CENSUS_DATA)));
logger.warn("{} must be downloaded manually; the version file {} was created at {}", getDataName(CANCER_GENE_CENSUS_DATA),
getDataVersionFilename(CANCER_GENE_CENSUS_DATA), geneDownloadPath);
if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) {
// HPO
saveDataSource(HPO_DISEASE_DATA, configuration.getDownload().getHpo().getVersion(), getTimeStamp(),
Collections.singletonList(getManualUrl(configuration.getDownload().getHpo(), HPO_FILE_ID)),
geneDownloadPath.resolve(getDataVersionFilename(HPO_DISEASE_DATA)));
logger.warn("{} must be downloaded manually; the version file {} was created at {}", getDataName(HPO_DISEASE_DATA),
getDataVersionFilename(HPO_DISEASE_DATA), geneDownloadPath);

// Cancer gene census
saveDataSource(CANCER_GENE_CENSUS_DATA, configuration.getDownload().getCancerGeneCensus().getVersion(), getTimeStamp(),
Collections.singletonList(getManualUrl(configuration.getDownload().getCancerGeneCensus(), CANCER_GENE_CENSUS_FILE_ID)),
geneDownloadPath.resolve(getDataVersionFilename(CANCER_GENE_CENSUS_DATA)));
logger.warn("{} must be downloaded manually; the version file {} was created at {}", getDataName(CANCER_GENE_CENSUS_DATA),
getDataVersionFilename(CANCER_GENE_CENSUS_DATA), geneDownloadPath);
}

logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_DATA));

return downloadFiles;
}

Expand Down Expand Up @@ -210,8 +212,7 @@ private DownloadFile downloadLrg(Path geneDownloadPath) throws IOException, Inte
if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) {
logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(LRG_DATA));

downloadFile = downloadAndSaveDataSource(configuration.getDownload().getLrg(), LRG_FILE_ID, LRG_DATA,
geneDownloadPath);
downloadFile = downloadAndSaveDataSource(configuration.getDownload().getLrg(), LRG_FILE_ID, LRG_DATA, geneDownloadPath);

logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(LRG_DATA));
}
Expand All @@ -225,8 +226,7 @@ private DownloadFile downloadHgnc(Path geneDownloadPath) throws IOException, Int
if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) {
logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(HGNC_DATA));

downloadFile = downloadAndSaveDataSource(configuration.getDownload().getHgnc(), HGNC_FILE_ID, HGNC_DATA,
geneDownloadPath);
downloadFile = downloadAndSaveDataSource(configuration.getDownload().getHgnc(), HGNC_FILE_ID, HGNC_DATA, geneDownloadPath);

logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(HGNC_DATA));
}
Expand Down Expand Up @@ -255,8 +255,7 @@ private DownloadFile downloadDrugData(Path geneDownloadPath) throws IOException,
if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) {
logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(DGIDB_DATA));

downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDgidb(), DGIDB_FILE_ID, DGIDB_DATA,
geneDownloadPath);
downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDgidb(), DGIDB_FILE_ID, DGIDB_DATA, geneDownloadPath);

logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(DGIDB_DATA));
}
Expand All @@ -267,10 +266,10 @@ private DownloadFile downloadGeneUniprotXref(Path geneDownloadPath) throws IOExc
DownloadFile downloadFile = null;

// Check if the species is supported
if (GENE_UNIPROT_XREF_FILES.containsKey(speciesConfiguration.getScientificName())) {
String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName());
if (configuration.getDownload().getGeneUniprotXref().getFiles().containsKey(prefixId + UNIPROT_XREF_FILE_ID)) {
logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(UNIPROT_XREF_DATA));

String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName());
downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGeneUniprotXref(),
prefixId + UNIPROT_XREF_FILE_ID, UNIPROT_XREF_DATA, geneDownloadPath);

Expand Down Expand Up @@ -329,11 +328,10 @@ private DownloadFile downloadGO(Path geneDownloadPath) throws IOException, Inter
DownloadFile downloadFile = null;

// Check if the species is supported
if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)
|| speciesConfiguration.getScientificName().equals(MUS_MUSCULUS_NAME)) {
String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName());
if (configuration.getDownload().getGoAnnotation().getFiles().containsKey(prefixId + GO_ANNOTATION_FILE_ID)) {
logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GO_ANNOTATION_DATA));

String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName());
downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGoAnnotation(),
prefixId + GO_ANNOTATION_FILE_ID, GO_ANNOTATION_DATA, geneDownloadPath);

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/*
* Copyright 2015-2020 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.opencb.cellbase.lib.download;

import org.opencb.cellbase.core.config.CellBaseConfiguration;
import org.opencb.cellbase.core.exception.CellBaseException;
import org.opencb.cellbase.core.utils.SpeciesUtils;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import static org.opencb.cellbase.lib.EtlCommons.*;

/**
 * Download manager for the Ensembl variation VCF files (variations and structural variations) of a species.
 *
 * <p>Nothing is downloaded for human ({@code HOMO_SAPIENS_NAME}); for any other species that has
 * {@code VARIATION_DATA} configured, the two VCF files are fetched into the variation download folder and a
 * single data-source version file listing both URLs is written next to them.
 */
public class VariationDownloadManager extends AbstractDownloadManager {

    /**
     * Creates a variation download manager; all arguments are forwarded unchanged to the parent constructor.
     *
     * @param species         Species to download data for
     * @param assembly        Assembly of the species
     * @param targetDirectory Output directory
     * @param configuration   CellBase configuration
     * @throws IOException       Propagated from the parent constructor
     * @throws CellBaseException Propagated from the parent constructor
     */
    public VariationDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration)
            throws IOException, CellBaseException {
        super(species, assembly, targetDirectory, configuration);
    }

    /**
     * Downloads the variation data; simply delegates to {@link #downloadVariation()}.
     *
     * @return The list of downloaded files (empty when nothing had to be downloaded)
     * @throws IOException          Propagated from {@link #downloadVariation()}
     * @throws InterruptedException Propagated from {@link #downloadVariation()}
     * @throws CellBaseException    Propagated from {@link #downloadVariation()}
     */
    @Override
    public List<DownloadFile> download() throws IOException, InterruptedException, CellBaseException {
        return downloadVariation();
    }

    /**
     * Downloads the Ensembl variation and structural variation VCF files for the current species.
     *
     * @return The list of downloaded files; empty if the species has no variation data configured or is human
     * @throws IOException          If the variation folder cannot be created, or propagated from the download/save helpers
     * @throws InterruptedException Propagated from the download helper
     * @throws CellBaseException    Propagated from the download/save helpers
     */
    public List<DownloadFile> downloadVariation() throws IOException, InterruptedException, CellBaseException {
        List<DownloadFile> downloadFiles = new ArrayList<>();

        // Check if species is supported
        if (!SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), VARIATION_DATA)) {
            return downloadFiles;
        }

        logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(VARIATION_DATA));

        Path variationFolder = downloadFolder.resolve(VARIATION_DATA);
        Files.createDirectories(variationFolder);

        // We do not need to download human variation data from Ensembl. It is already included in CellBase.
        if (!speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) {
            String baseUrl = ensemblHostUrl + "/" + ensemblRelease + "/variation/vcf/" + speciesShortName + "/";

            // Remote files are VCF, so the local copies keep the same name (and the .vcf.gz extension)
            String[] vcfFilenames = {
                speciesShortName + ".vcf.gz",
                speciesShortName + "_structural_variations.vcf.gz",
            };

            List<String> urls = new ArrayList<>();
            for (String vcfFilename : vcfFilenames) {
                String url = baseUrl + vcfFilename;
                String fileName = variationFolder.resolve(vcfFilename).toString();

                // NOTE(review): the message name suggests two placeholders (from, to) - confirm in EtlCommons
                logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, fileName);
                downloadFiles.add(downloadFile(url, fileName));
                logger.info(OK_LOG_MESSAGE);

                urls.add(url);
            }

            // Both files share one version file: save it once with every URL, otherwise a second save to the
            // same path would replace the first one and only the last URL would be recorded
            saveDataSource(VARIATION_DATA, ensemblVersion, getTimeStamp(), Collections.unmodifiableList(urls),
                    variationFolder.resolve(getDataVersionFilename(VARIATION_DATA)));
        }

        logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(VARIATION_DATA));

        return downloadFiles;
    }
}

0 comments on commit f44dcc5

Please sign in to comment.