Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support Exomiser resources 2402 or newer #661

Draft
wants to merge 5 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ private void getClinvarPathScores() {
continue; // should almost never happen
}
float pathogenicity = calculatePathogenicity(variantEffect, pathogenicityData);
ClinVarData clinVarData = pathogenicityData.getClinVarData();
ClinVarData clinVarData = pathogenicityData.clinVarData();
// ClinVar have three 'pathogenic' significance values - pathogenic, pathogenic_or_likely_pathogenic and likely_pathogenic
// they also have a review status which will tell you how much confidence you might want to assign a given interpretation.
// see https://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
Expand Down Expand Up @@ -250,60 +250,60 @@ private void binPathogenicityData() {
.map(TermId::getValue)
.orElse("UNKNOWN");

Frequency afr = frequencyData.getFrequencyForSource(GNOMAD_E_AFR);
Frequency afr = frequencyData.frequency(GNOMAD_E_AFR);
if (afr==null) {
afr = frequencyData.getFrequencyForSource(GNOMAD_G_AFR);
afr = frequencyData.frequency(GNOMAD_G_AFR);
}
if (afr!=null) {
float frequencyAsPercentage = afr.getFrequency();
float frequencyAsPercentage = afr.frequency();
addToBin(genesymbol, id, frequencyAsPercentage, pathogenicity, GNOMAD_E_AFR);
}
Frequency amr = frequencyData.getFrequencyForSource(GNOMAD_E_AMR);
Frequency amr = frequencyData.frequency(GNOMAD_E_AMR);
if (amr==null) {
amr=frequencyData.getFrequencyForSource(GNOMAD_G_AMR);
amr=frequencyData.frequency(GNOMAD_G_AMR);
}
if (amr!=null) {
float frequencyAsPercentage = amr.getFrequency();
float frequencyAsPercentage = amr.frequency();
addToBin(genesymbol, id, frequencyAsPercentage, pathogenicity, GNOMAD_E_AMR);
}
Frequency asj = frequencyData.getFrequencyForSource(GNOMAD_E_ASJ);
Frequency asj = frequencyData.frequency(GNOMAD_E_ASJ);
if (asj==null) {
asj=frequencyData.getFrequencyForSource(GNOMAD_G_ASJ);
asj=frequencyData.frequency(GNOMAD_G_ASJ);
}
if (asj!=null) {
float frequencyAsPercentage = asj.getFrequency();
float frequencyAsPercentage = asj.frequency();
addToBin(genesymbol, id, frequencyAsPercentage, pathogenicity, GNOMAD_E_ASJ);
}
Frequency eas = frequencyData.getFrequencyForSource(GNOMAD_E_EAS);
Frequency eas = frequencyData.frequency(GNOMAD_E_EAS);
if (eas==null) {
eas=frequencyData.getFrequencyForSource(GNOMAD_G_EAS);
eas=frequencyData.frequency(GNOMAD_G_EAS);
}
if (eas!=null) {
float frequencyAsPercentage = eas.getFrequency();
float frequencyAsPercentage = eas.frequency();
addToBin(genesymbol, id, frequencyAsPercentage, pathogenicity, GNOMAD_E_EAS);
}
Frequency fin = frequencyData.getFrequencyForSource(GNOMAD_E_FIN);
Frequency fin = frequencyData.frequency(GNOMAD_E_FIN);
if (fin==null) {
fin= frequencyData.getFrequencyForSource(GNOMAD_G_FIN);
fin= frequencyData.frequency(GNOMAD_G_FIN);
}
if (fin!=null) {
float frequencyAsPercentage = fin.getFrequency();
float frequencyAsPercentage = fin.frequency();
addToBin(genesymbol, id, frequencyAsPercentage, pathogenicity, GNOMAD_E_FIN);
}
Frequency nfe = frequencyData.getFrequencyForSource(GNOMAD_E_NFE);
Frequency nfe = frequencyData.frequency(GNOMAD_E_NFE);
if (nfe==null) {
nfe = frequencyData.getFrequencyForSource(GNOMAD_G_NFE);
nfe = frequencyData.frequency(GNOMAD_G_NFE);
}
if (nfe!=null) {
float frequencyAsPercentage = nfe.getFrequency();
float frequencyAsPercentage = nfe.frequency();
addToBin(genesymbol, id, frequencyAsPercentage, pathogenicity, GNOMAD_E_NFE);
}
Frequency sas = frequencyData.getFrequencyForSource(GNOMAD_E_SAS);
Frequency sas = frequencyData.frequency(GNOMAD_E_SAS);
if (sas==null) {
sas= frequencyData.getFrequencyForSource(GNOMAD_G_SAS);
sas= frequencyData.frequency(GNOMAD_G_SAS);
}
if (sas!=null) {
float frequencyAsPercentage = sas.getFrequency();
float frequencyAsPercentage = sas.frequency();
addToBin(genesymbol, id, frequencyAsPercentage, pathogenicity, GNOMAD_E_SAS);
}
if (c++%100_000==0) {
Expand All @@ -330,8 +330,8 @@ private Optional<GenomicVariant> prepareGenomicVariant(AlleleProto.AlleleKey all
* @return the predicted pathogenicity score.
*/
private float calculatePathogenicity(VariantEffect variantEffect, PathogenicityData pathogenicityData) {
float predictedScore = pathogenicityData.getScore();
float variantEffectScore = VariantEffectPathogenicityScore.getPathogenicityScoreOf(variantEffect);
float predictedScore = pathogenicityData.pathogenicityScore();
float variantEffectScore = VariantEffectPathogenicityScore.pathogenicityScoreOf(variantEffect);
switch (variantEffect) {
case MISSENSE_VARIANT:
return pathogenicityData.hasPredictedScore() ? predictedScore : variantEffectScore;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import org.monarchinitiative.lirical.core.model.LiricalVariant;
import org.monarchinitiative.lirical.core.model.TranscriptDatabase;
import org.monarchinitiative.lirical.core.sanitize.*;
import org.monarchinitiative.lirical.exomiser_db_adapter.ExomiserResources;
import org.monarchinitiative.lirical.io.LiricalDataException;
import org.monarchinitiative.lirical.io.background.CustomBackgroundVariantFrequencyServiceFactory;
import org.monarchinitiative.phenol.annotations.io.hpo.DiseaseDatabase;
Expand All @@ -26,6 +27,8 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
Expand All @@ -34,6 +37,8 @@
abstract class LiricalConfigurationCommand extends BaseCommand {

private static final Logger LOGGER = LoggerFactory.getLogger(LiricalConfigurationCommand.class);
private static final Pattern EXOMISER_VARIANT_DB = Pattern.compile("(?<date>\\d{4})_(?<build>(hg19|hg38))_variants\\.mv\\.db", Pattern.CASE_INSENSITIVE);
private static final Pattern EXOMISER_CLINVAR_DB = Pattern.compile("(?<date>\\d{4})_(?<build>(hg19|hg38))_clinvar\\.mv\\.db", Pattern.CASE_INSENSITIVE);
protected static final String UNKNOWN_VERSION_PLACEHOLDER = "UNKNOWN VERSION";

// ---------------------------------------------- RESOURCES --------------------------------------------------------
Expand All @@ -46,23 +51,39 @@ public static class DataSection {
description = "Path to Lirical data directory.")
public Path liricalDataDirectory = null;

@CommandLine.Option(names = {"-e19", "--exomiser-hg19"},
@CommandLine.Option(names = {"-ed19", "--exomiser-hg19-dir"},
description = "Path to the Exomiser data directory for hg19.")
public Path exomiserHg19DataDirectory = null;

@CommandLine.Option(names = {"-e19", "--exomiser-hg19", "--exomiser-hg19-variants"},
description = "Path to the Exomiser variant database for hg19.")
public Path exomiserHg19Database = null;

@CommandLine.Option(names = {"-e38", "--exomiser-hg38"},
@CommandLine.Option(names = {"-c19", "--exomiser-hg19-clinvar"},
description = "Path to the Exomiser clinvar database for hg19.")
public Path exomiserHg19ClinvarDatabase = null;

@CommandLine.Option(names = {"-ed38", "--exomiser-hg38-dir"},
description = "Path to the Exomiser data directory for hg38.")
public Path exomiserHg38DataDirectory = null;

@CommandLine.Option(names = {"-e38", "--exomiser-hg38", "--exomiser-hg38-variants"},
description = "Path to the Exomiser variant database for hg38.")
public Path exomiserHg38Database = null;

@CommandLine.Option(names = {"-c38", "--exomiser-hg38-clinvar"},
description = "Path to the Exomiser clinvar database for hg38.")
public Path exomiserHg38ClinvarDatabase = null;

@CommandLine.Option(names = {"-b", "--background"},
description = "Path to non-default background frequency file.")
public Path backgroundFrequencyFile = null;

@CommandLine.Option(names = "--parallelism",
description = {
"The number of workers/threads to use.",
"The value must be a positive integer.",
"Default: ${DEFAULT-VALUE}"})
description = {
"The number of workers/threads to use.",
"The value must be a positive integer.",
"Default: ${DEFAULT-VALUE}"})
public int parallelism = 1;
}

Expand All @@ -82,7 +103,7 @@ public static class RunConfiguration {

@CommandLine.Option(names = {"--sdwndv"},
description = {
"Include diseases even if no deleterious variants are found in the gene associated with the disease.",
"Include diseases even if no deleterious variants are found in the gene associated with the disease.",
"Only applicable to the HTML report when running with a VCF file (genotype-aware mode).",
"(default: ${DEFAULT-VALUE})"
})
Expand All @@ -104,7 +125,7 @@ public static class RunConfiguration {

@CommandLine.Option(names = {"--variant-background-frequency"},
description = {
"Default frequency of called-pathogenic variants in the general population (gnomAD).",
"Default frequency of called-pathogenic variants in the general population (gnomAD).",
"In the vast majority of cases, we can derive this information from gnomAD.",
"This constant is used if for whatever reason, data was not available for a gene.",
"(default: ${DEFAULT-VALUE})."})
Expand All @@ -119,7 +140,7 @@ public static class RunConfiguration {
description = {"Validation policy for the analysis", "(default: ${DEFAULT-VALUE})."})
public ValidationPolicy validationPolicy = ValidationPolicy.MINIMAL;

@CommandLine.Option(names ={"--dry-run"},
@CommandLine.Option(names = {"--dry-run"},
description = {
"Validate the input, report potential issues, and exit without running the analysis.",
"(default ${DEFAULT-VALUE})"
Expand All @@ -128,7 +149,7 @@ public static class RunConfiguration {
}

protected List<String> checkInput() {
List<String> errors = new LinkedList<>();
List<String> errors = new ArrayList<>();

Path codeHomeParent = codeHomeDir();
// resources
Expand All @@ -153,18 +174,20 @@ protected List<String> checkInput() {
String msg = "Genome build must be set";
errors.add(msg);
} else {
// Check Exomiser db seem to match the genome build.
// Check Exomiser resources match the genome build.
switch (genomeBuild.get()) {
case HG19 -> {
if (dataSection.exomiserHg19Database == null && dataSection.exomiserHg38Database != null) {
String msg = "Genome build set to %s but Exomiser variant database is set for %s: %s".formatted(GenomeBuild.HG19, GenomeBuild.HG38, dataSection.exomiserHg38Database.toAbsolutePath());
errors.add(msg);
if (dataSection.exomiserHg19DataDirectory == null
&& (dataSection.exomiserHg19Database == null || dataSection.exomiserHg19ClinvarDatabase == null)
) {
errors.add("Genome build set to HG19 but Exomiser resources are unset");
}
}
case HG38 -> {
if (dataSection.exomiserHg38Database == null && dataSection.exomiserHg19Database != null) {
String msg = "Genome build set to %s but Exomiser variant database is set for %s: %s".formatted(GenomeBuild.HG38, GenomeBuild.HG19, dataSection.exomiserHg19Database.toAbsolutePath());
errors.add(msg);
if (dataSection.exomiserHg38DataDirectory == null
&& (dataSection.exomiserHg38Database == null || dataSection.exomiserHg38ClinvarDatabase == null)
) {
errors.add("Genome build set to HG38 but Exomiser resources are unset");
}
}
}
Expand All @@ -184,17 +207,16 @@ protected List<String> checkInput() {
protected Lirical bootstrapLirical(GenomeBuild genomeBuild) throws LiricalDataException {
LiricalBuilder builder = LiricalBuilder.builder(dataSection.liricalDataDirectory);

switch (genomeBuild) {
case HG19 -> {
if (dataSection.exomiserHg19Database != null)
builder.exomiserVariantDbPath(GenomeBuild.HG19, dataSection.exomiserHg19Database);
}
case HG38 -> {
if (dataSection.exomiserHg38Database != null)
builder.exomiserVariantDbPath(GenomeBuild.HG38, dataSection.exomiserHg38Database);
}
}
// Deal with Exomiser resources
ExomiserResources resources = loadExomiserResources(
genomeBuild,
dataSection.exomiserHg19DataDirectory,
dataSection.exomiserHg19Database,
dataSection.exomiserHg19ClinvarDatabase
);
builder.exomiserResources(genomeBuild, resources);

// Background variant frequencies
if (dataSection.backgroundFrequencyFile != null) {
LOGGER.debug("Using custom deleterious variant background frequency file at {} for {}",
dataSection.backgroundFrequencyFile.toAbsolutePath(),
Expand All @@ -208,6 +230,84 @@ protected Lirical bootstrapLirical(GenomeBuild genomeBuild) throws LiricalDataEx
.build();
}

private static ExomiserResources loadExomiserResources(
GenomeBuild genomeBuild,
Path exomiserDataDirectory,
Path exoimserAlleleDatabasePath,
Path exomiserClinvarDatabasePath
) {
// At the end of the function, we need to possess path to allele DB and Clinvar DB files.
//
Path exomiserAlleleDb = null;
Path exomiserClinvarDb = null;
if (exomiserDataDirectory != null) {
if (Files.isDirectory(exomiserDataDirectory)) {
LOGGER.debug("Using exomiser data directory at {}", exomiserDataDirectory.toAbsolutePath());

// We check that the path is a directory, hence no inspection.
// This can still fail in case of an IO error.
//noinspection DataFlowIssue
for (String file : exomiserDataDirectory.toFile().list()) {
Matcher alleleMatcher = EXOMISER_VARIANT_DB.matcher(file);
if (alleleMatcher.matches()) {
Path path = exomiserDataDirectory.resolve(file);

String build = alleleMatcher.group("build");
if (!genomeBuild.name().equalsIgnoreCase(build)) {
LOGGER.warn(
"Genome build {} is in use but Exomiser variant database seems to be for {}: {}",
genomeBuild, build, path.toAbsolutePath()
);
}
LOGGER.debug("Using exomiser variant database at {}", path.toAbsolutePath());
exomiserAlleleDb = path;
continue;
}

Matcher clinvarMatcher = EXOMISER_CLINVAR_DB.matcher(file);
if (clinvarMatcher.matches()) {
Path path = exomiserDataDirectory.resolve(file);
String build = clinvarMatcher.group("build");
if (!genomeBuild.name().equalsIgnoreCase(build)) {
LOGGER.warn(
"Genome build {} is in use but Exomiser clinvar database seems to be for {}: {}",
genomeBuild, build, path.toAbsolutePath()
);
}
LOGGER.debug("Using exomiser clinvar database at {}", path.toAbsolutePath());
exomiserClinvarDb = path;
}

if (exomiserAlleleDb != null && exomiserClinvarDb != null)
// We found both allele and clinvar database files
break;
}
}
}

// Providing explicit database paths overrides paths from the Exomiser data directory
if (exoimserAlleleDatabasePath != null) {
if (exomiserAlleleDb != null) {
LOGGER.debug("Using {} instead of {} due to CLI option override",
exoimserAlleleDatabasePath.toAbsolutePath(), exomiserAlleleDb);
} else {
LOGGER.debug("Using exomiser variant database at {}", exoimserAlleleDatabasePath.toAbsolutePath());
}
exomiserAlleleDb = exoimserAlleleDatabasePath;
}
if (exomiserClinvarDatabasePath != null) {
if (exomiserClinvarDb != null) {
LOGGER.debug("Using {} instead of {} due to CLI option override",
exomiserClinvarDatabasePath.toAbsolutePath(), exomiserAlleleDb);
} else {
LOGGER.debug("Using exomiser clinvar database at {}", exomiserClinvarDatabasePath.toAbsolutePath());
}
exomiserClinvarDb = exomiserClinvarDatabasePath;
}

return new ExomiserResources(exomiserAlleleDb, exomiserClinvarDb);
}

protected abstract String getGenomeBuild();

protected GenomeBuild parseGenomeBuild(String genomeBuild) throws LiricalDataException {
Expand Down Expand Up @@ -305,7 +405,7 @@ protected static SampleIdAndGenesAndGenotypes readVariantsFromVcfFile(String sam
* {@code null}, we can only accept a single-sample VCF, mainly as a convenience.
*
* @throws LiricalParseException if the VCF includes no sample data, the sample is not present,
* or it is a multi-sample file and the sample ID is unset.
* or it is a multi-sample file and the sample ID is unset.
*/
private static String validateSampleId(String sampleId,
Path vcfPath,
Expand Down
Loading
Loading