Skip to content

Commit

Permalink
lib: improve gene downloader by updating versions from config file, a…
Browse files Browse the repository at this point in the history
…dding log messages, removing hardcoded filenames, sonar issues,..., #TASK-5775, #TASK-5564
  • Loading branch information
jtarraga committed Mar 8, 2024
1 parent 1a5ba4a commit 4cdd046
Show file tree
Hide file tree
Showing 4 changed files with 174 additions and 173 deletions.
15 changes: 6 additions & 9 deletions cellbase-core/src/main/resources/configuration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,9 @@ download:
version: "2021-03-30"
geneUniprotXref:
host: http://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/
version: "2023-11-08"
version: "2024_01 (24-Jan-2024)"
geneExpressionAtlas:
host: ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz
host: https://ftp.ebi.ac.uk/pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz
version: "2.0.14"
goAnnotation:
host: http://geneontology.org/gene-associations/goa_human.gaf.gz
Expand Down Expand Up @@ -171,14 +171,11 @@ download:
host: https://ftp.ebi.ac.uk/pub/databases/gwas/releases/2024/02/12/gwas-catalog-associations_ontology-annotated.tsv
version: "2024-02-12"
hpo:
## NOTE: Download manually from here now: https://hpo.jax.org/app/data/annotations
host: https://ci.monarchinitiative.org/view/hpo/job/hpo.annotations/lastSuccessfulBuild/artifact/rare-diseases/util/annotation/phenotype_to_genes.txt
version: "2024-03-01"
## NOTE: this file must now be downloaded manually from the URL below
host: https://hpo.jax.org/app/data/annotations
disgenet:
host: https://www.disgenet.org/static/disgenet_ap1/files/downloads
files:
- all_gene_disease_associations.tsv.gz
- readme.txt
host: https://www.disgenet.org/static/disgenet_ap1/files/downloads/all_gene_disease_associations.tsv.gz
version: "7.0 (January 2020)"
dgidb:
host: https://old.dgidb.org/data/monthly_tsvs/2022-Feb/interactions.tsv
version: "2022-02-01"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,29 @@
*/
public class EtlCommons {

/** Suffix appended to a source name to build its version file name, e.g. "hgnc" + suffix = "hgncVersion.json". */
public static final String SUFFIX_VERSION_FILENAME = "Version.json";

public static final String GENOME_DATA = "genome";

public static final String GENE_DATA = "gene";
// Per-source version file names: <source name> + SUFFIX_VERSION_FILENAME
public static final String ENSEMBL_CORE_VERSION_FILENAME = "ensemblCore" + SUFFIX_VERSION_FILENAME;
public static final String MANE_SELECT_VERSION_FILENAME = "maneSelect" + SUFFIX_VERSION_FILENAME;
public static final String LRG_VERSION_FILENAME = "lrg" + SUFFIX_VERSION_FILENAME;
public static final String HGNC_VERSION_FILENAME = "hgnc" + SUFFIX_VERSION_FILENAME;
public static final String CANCER_HOTSPOT_VERSION_FILENAME = "cancerHotspot" + SUFFIX_VERSION_FILENAME;
public static final String GO_ANNOTATION_VERSION_FILENAME = "goAnnotation" + SUFFIX_VERSION_FILENAME;
public static final String GNOMAD_VERSION_FILENAME = "gnomad" + SUFFIX_VERSION_FILENAME;
public static final String DGIDB_VERSION_FILENAME = "dgidb" + SUFFIX_VERSION_FILENAME;
public static final String UNIPROT_XREF_VERSION_FILENAME = "uniprotXref" + SUFFIX_VERSION_FILENAME;
public static final String GENE_EXPRESSION_ATLAS_VERSION_FILENAME = "geneExpressionAtlas" + SUFFIX_VERSION_FILENAME;
public static final String HPO_VERSION_FILENAME = "hpo" + SUFFIX_VERSION_FILENAME;
public static final String DISGENET_VERSION_FILENAME = "disgenet" + SUFFIX_VERSION_FILENAME;
/**
 * @deprecated misspelled name kept only so existing callers keep compiling;
 * use {@link #DISGENET_VERSION_FILENAME} instead.
 */
@Deprecated
public static final String DISGINET_VERSION_FILENAME = DISGENET_VERSION_FILENAME;

public static final String REFSEQ_DATA = "refseq";
public static final String REFSEQ_VERSION_FILENAME = REFSEQ_DATA + SUFFIX_VERSION_FILENAME;
public static final String REFSEQ_FASTA_VERSION_FILENAME = REFSEQ_DATA + "Fasta" + SUFFIX_VERSION_FILENAME;
/**
 * @deprecated misspelled name kept only so existing callers keep compiling;
 * use {@link #REFSEQ_FASTA_VERSION_FILENAME} instead.
 */
@Deprecated
public static final String REFSEQ_ASTA_VERSION_FILENAME = REFSEQ_FASTA_VERSION_FILENAME;
public static final String REFSEQ_PROTEIN_FASTA_VERSION_FILENAME = REFSEQ_DATA + "ProteinFasta" + SUFFIX_VERSION_FILENAME;
public static final String REFSEQ_CDNA_FASTA_VERSION_FILENAME = REFSEQ_DATA + "CdnaFasta" + SUFFIX_VERSION_FILENAME;
public static final String GENE_DISEASE_ASSOCIATION_DATA = "gene_disease_association";
public static final String VARIATION_DATA = "variation";
public static final String VARIATION_FUNCTIONAL_SCORE_DATA = "variation_functional_score";
Expand All @@ -51,7 +71,7 @@ public class EtlCommons {
public static final String PHARMACOGENOMICS_DATA = "pharmacogenomics";
public static final String PHARMGKB_NAME = "PharmGKB";
public static final String PHARMGKB_DATA = "pharmgkb";
public static final String PHARMGKB_VERSION_FILENAME = "pharmgkbVersion.json";
public static final String PHARMGKB_VERSION_FILENAME = PHARMGKB_DATA + SUFFIX_VERSION_FILENAME;

public static final String CLINICAL_VARIANTS_FOLDER = "clinicalVariant";
@Deprecated
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,12 @@ protected void saveVersionData(String data, String name, String version, String
Map<String, Object> versionDataMap = new HashMap<>();
versionDataMap.put("data", data);
versionDataMap.put("name", name);
versionDataMap.put("version", version);
if (StringUtils.isEmpty(version)) {
logger.warn("Version missing for data source {}/{}, using the date as version: {}", data, name, date);
versionDataMap.put("version", date);
} else {
versionDataMap.put("version", version);
}
versionDataMap.put("date", date);
versionDataMap.put("url", url);

Expand Down Expand Up @@ -291,6 +296,7 @@ private long getExpectedFileSize(String outputFileLog) {
return -1;
}

@Deprecated
protected String getVersionFromVersionLine(Path path, String tag) {
Files.exists(path);
try {
Expand Down Expand Up @@ -326,6 +332,10 @@ private String getEnsemblURL(SpeciesConfiguration sp) {
}
return ensemblHostUrl;
}

/**
 * Returns the last path segment of a URL, i.e. the text after the final '/'.
 * <p>
 * The URL is handled as a plain string rather than as a file-system path, so the
 * result does not depend on the platform's path syntax (a URL scheme such as
 * "https:" is not a valid path on every file system). Trailing slashes are ignored,
 * so "http://host/dir/" yields "dir". Any query string or fragment is kept as part
 * of the returned name.
 *
 * @param url URL (or slash-separated path) to extract the file name from
 * @return the last segment of {@code url}; an empty string if {@code url} is empty
 *         or consists only of slashes
 * @throws NullPointerException if {@code url} is null
 */
protected String getUrlFilename(String url) {
    if (url == null) {
        throw new NullPointerException("url must not be null");
    }
    // Skip trailing slashes so a directory-style URL still yields its last segment
    int end = url.length();
    while (end > 0 && url.charAt(end - 1) == '/') {
        end--;
    }
    // lastIndexOf returns -1 when there is no slash (or end is 0), giving start == 0
    int start = url.lastIndexOf('/', end - 1) + 1;
    return url.substring(start, end);
}
}


Loading

0 comments on commit 4cdd046

Please sign in to comment.