Merge pull request #669 from TheJacksonLaboratory/release-2.0.4

Make release 2.0.4
TheJacksonLaboratory · Jan 31, 2025 · d9b4c64 · d9b4c64
2 parents 7cd5019 + 5f03369
commit d9b4c64
Show file tree

Hide file tree

Showing 14 changed files with 140 additions and 35 deletions.
diff --git a/docs/conf.py b/docs/conf.py
@@ -56,7 +56,7 @@
 # The short X.Y version.
 version = u'2.0'
 # The full version, including alpha/beta/rc tags.
-release = u'2.0.3'
+release = u'2.0.4'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/docs/running.rst b/docs/running.rst
@@ -80,6 +80,10 @@ The configuration options tweak the analysis.
   The JSON report will include *all* diseases all the time.
 * ``--transcript-db``: transcript database (default: ``RefSeq``), see :ref:`rsttx-dbs` for more info.
 * ``--use-orphanet``: use `Orphanet <https://www.orpha.net/consor/cgi-bin/index.php>`_ annotations (default: ``false``).
+* ``--target-diseases``: limit the analysis to the provided disease IDs.
+  Expects a comma-separated list of disease IDs, such as ``OMIM:614102,OMIM:619340``.
+  The ``--use-orphanet`` option is ignored if at least one disease ID is provided.
+  All diseases are analyzed by default.
 * ``--strict``: use strict penalties if the genotype does not match the disease model
   in terms of number of called pathogenic alleles (default: ``false``).
 * ``--pathogenicity-threshold``: Variants with greater pathogenicity score is considered deleterious (default: ``0.8``).

diff --git a/lirical-background/pom.xml b/lirical-background/pom.xml
@@ -5,7 +5,7 @@
     <parent>
         <artifactId>LIRICAL</artifactId>
         <groupId>org.monarchinitiative.lirical</groupId>
-        <version>2.0.3</version>
+        <version>2.0.4</version>
     </parent>
     <modelVersion>4.0.0</modelVersion>
 

diff --git a/lirical-cli/pom.xml b/lirical-cli/pom.xml
@@ -5,7 +5,7 @@
     <parent>
         <artifactId>LIRICAL</artifactId>
         <groupId>org.monarchinitiative.lirical</groupId>
-        <version>2.0.3</version>
+        <version>2.0.4</version>
     </parent>
     <modelVersion>4.0.0</modelVersion>
 

diff --git a/...-cli/src/main/java/org/monarchinitiative/lirical/cli/cmd/LiricalConfigurationCommand.java b/...-cli/src/main/java/org/monarchinitiative/lirical/cli/cmd/LiricalConfigurationCommand.java
@@ -26,6 +26,8 @@
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.*;
+import java.util.function.Predicate;
+import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
 /**
@@ -35,6 +37,7 @@ abstract class LiricalConfigurationCommand extends BaseCommand {
 
     private static final Logger LOGGER = LoggerFactory.getLogger(LiricalConfigurationCommand.class);
     protected static final String UNKNOWN_VERSION_PLACEHOLDER = "UNKNOWN VERSION";
+    private static final Pattern DISEASE_ID = Pattern.compile("^\\w+:\\w+$");
 
     // ---------------------------------------------- RESOURCES --------------------------------------------------------
     @CommandLine.ArgGroup(validate = false, heading = "Resource paths:%n")
@@ -97,6 +100,16 @@ public static class RunConfiguration {
                 description = "Use Orphanet annotation data (default: ${DEFAULT-VALUE}).")
         public boolean useOrphanet = false;
 
+        @CommandLine.Option(names = {"--target-diseases"},
+                split = ",",
+                paramLabel = "disease",
+                description = {
+                    "Limit the analysis to the provided disease IDs. ",
+                    "(default: analyze all diseases)."
+                }
+        )
+        public List<String> targetDiseases= null;
+
         @CommandLine.Option(names = {"--strict"},
                 description = "Use strict penalties if the genotype does not match the disease model in terms " +
                         "of number of called pathogenic alleles. (default: ${DEFAULT-VALUE}).")
@@ -175,6 +188,16 @@ protected List<String> checkInput() {
             errors.add(msg);
         }
 
+        if (runConfiguration.targetDiseases != null
+                && !runConfiguration.targetDiseases.stream()
+                .allMatch(DISEASE_ID.asMatchPredicate())) {
+            String failures = runConfiguration.targetDiseases.stream()
+                    .filter(Predicate.not(DISEASE_ID.asMatchPredicate()))
+                    .collect(Collectors.joining(","));
+            String msg = "One or more target disease IDs do not look like a compact URI: %s".formatted(failures);
+            errors.add(msg);
+        }
+
         return errors;
     }
 
@@ -234,6 +257,15 @@ protected AnalysisOptions prepareAnalysisOptions(Lirical lirical, GenomeBuild ge
         LOGGER.debug("Using disease databases {}", usedDatabasesSummary);
         builder.setDiseaseDatabases(diseaseDatabases);
 
+        if (runConfiguration.targetDiseases != null) {
+            String usedDiseaseIds = runConfiguration.targetDiseases.stream().collect(Collectors.joining(", ", "[", "]"));
+            LOGGER.debug("Limiting the analysis to the following diseases: {}", usedDiseaseIds);
+            List<TermId> targetDiseases = runConfiguration.targetDiseases.stream()
+                    .map(TermId::of)
+                    .toList();
+            builder.setTargetDiseases(targetDiseases);
+        }
+
         // The rest..
         LOGGER.debug("Variants with pathogenicity score >{} are considered deleterious", runConfiguration.pathogenicityThreshold);
         builder.variantDeleteriousnessThreshold(runConfiguration.pathogenicityThreshold);

diff --git a/lirical-configuration/pom.xml b/lirical-configuration/pom.xml
@@ -5,7 +5,7 @@
     <parent>
         <artifactId>LIRICAL</artifactId>
         <groupId>org.monarchinitiative.lirical</groupId>
-        <version>2.0.3</version>
+        <version>2.0.4</version>
     </parent>
     <modelVersion>4.0.0</modelVersion>
 

diff --git a/lirical-core/pom.xml b/lirical-core/pom.xml
@@ -5,7 +5,7 @@
     <parent>
         <artifactId>LIRICAL</artifactId>
         <groupId>org.monarchinitiative.lirical</groupId>
-        <version>2.0.3</version>
+        <version>2.0.4</version>
     </parent>
     <modelVersion>4.0.0</modelVersion>
 

diff --git a/lirical-core/src/main/java/org/monarchinitiative/lirical/core/analysis/AnalysisOptions.java b/lirical-core/src/main/java/org/monarchinitiative/lirical/core/analysis/AnalysisOptions.java
@@ -4,6 +4,7 @@
 import org.monarchinitiative.lirical.core.model.GenomeBuild;
 import org.monarchinitiative.lirical.core.model.TranscriptDatabase;
 import org.monarchinitiative.phenol.annotations.io.hpo.DiseaseDatabase;
+import org.monarchinitiative.phenol.ontology.data.TermId;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -33,6 +34,14 @@ static Builder builder() {
      */
     Set<DiseaseDatabase> diseaseDatabases();
 
+    /**
+     * Limit the analysis to specific diseases.
+     *
+     * @return a collection of disease IDs of the diseases of interest or {@code null}
+     * if <em>all</em> diseases should be tested.
+     */
+    Collection<TermId> targetDiseases();
+
     /**
      * @return threshold for determining if the variant is deleterious or not.
      * The threshold range must be in range of <code>[0,1]</code>.
@@ -71,14 +80,16 @@ static Builder builder() {
 
     /**
      * A builder for {@link AnalysisOptions}.
+     * <p>
+     * The builder is <em>NOT</em> thread safe!
      */
     class Builder {
 
         private static final Logger LOGGER = LoggerFactory.getLogger(Builder.class);
-
         private GenomeBuild genomeBuild = GenomeBuild.HG38;
         private TranscriptDatabase transcriptDatabase = TranscriptDatabase.REFSEQ;
         private final Set<DiseaseDatabase> diseaseDatabases = new HashSet<>(List.of(DiseaseDatabase.OMIM, DiseaseDatabase.DECIPHER));
+        private Set<TermId> targetDiseases = null;  // null = test all diseases
         private float variantDeleteriousnessThreshold = .8f;
         private double defaultVariantBackgroundFrequency = .1;
         private boolean useStrictPenalties = false;
@@ -135,6 +146,42 @@ public Builder setDiseaseDatabases(Collection<DiseaseDatabase> diseaseDatabases)
             return this;
         }
 
+        public Builder clearTargetDiseases() {
+            if (this.targetDiseases != null)
+                this.targetDiseases.clear();
+            return this;
+        }
+
+        public Builder addTargetDiseases(TermId... diseaseIds) {
+            return addTargetDiseases(Arrays.asList(diseaseIds));
+        }
+
+        public Builder addTargetDiseases(Collection<TermId> diseaseIds) {
+            if (diseaseIds == null) {
+                LOGGER.warn("Target disease IDs must not be `null`!");
+                return this;
+            }
+
+            if (this.targetDiseases == null) this.targetDiseases = new HashSet<>();
+
+            this.targetDiseases.addAll(diseaseIds);
+
+            return this;
+        }
+
+        public Builder setTargetDiseases(Collection<TermId> diseaseIds) {
+            if (diseaseIds == null) {
+                LOGGER.warn("Target disease IDs must not be `null`!");
+                return this;
+            }
+
+            if (this.targetDiseases == null) this.targetDiseases = new HashSet<>();
+
+            this.targetDiseases.clear();
+            this.targetDiseases.addAll(diseaseIds);
+            return this;
+        }
+
         public Builder variantDeleteriousnessThreshold(float variantDeleteriousnessThreshold) {
             this.variantDeleteriousnessThreshold = variantDeleteriousnessThreshold;
             return this;
@@ -169,6 +216,7 @@ public AnalysisOptions build() {
             return new AnalysisOptionsDefault(genomeBuild,
                     transcriptDatabase,
                     diseaseDatabases,
+                    targetDiseases,
                     variantDeleteriousnessThreshold,
                     defaultVariantBackgroundFrequency,
                     useStrictPenalties,

diff --git a/...ore/src/main/java/org/monarchinitiative/lirical/core/analysis/AnalysisOptionsDefault.java b/...ore/src/main/java/org/monarchinitiative/lirical/core/analysis/AnalysisOptionsDefault.java
@@ -4,13 +4,16 @@
 import org.monarchinitiative.lirical.core.model.GenomeBuild;
 import org.monarchinitiative.lirical.core.model.TranscriptDatabase;
 import org.monarchinitiative.phenol.annotations.io.hpo.DiseaseDatabase;
+import org.monarchinitiative.phenol.ontology.data.TermId;
 
+import java.util.Collection;
 import java.util.Set;
 
 record AnalysisOptionsDefault(
         GenomeBuild genomeBuild,
         TranscriptDatabase transcriptDatabase,
         Set<DiseaseDatabase> diseaseDatabases,
+        Collection<TermId> targetDiseases,
         float variantDeleteriousnessThreshold,
         double defaultVariantBackgroundFrequency,
         boolean useStrictPenalties,

diff --git a/...main/java/org/monarchinitiative/lirical/core/analysis/impl/LiricalAnalysisRunnerImpl.java b/...main/java/org/monarchinitiative/lirical/core/analysis/impl/LiricalAnalysisRunnerImpl.java
@@ -17,6 +17,7 @@
 import java.util.*;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ForkJoinPool;
+import java.util.function.Predicate;
 import java.util.stream.Stream;
 
 public class LiricalAnalysisRunnerImpl implements LiricalAnalysisRunner {
@@ -48,22 +49,20 @@ private LiricalAnalysisRunnerImpl(PhenotypeService phenotypeService,
 
     @Override
     public AnalysisResults run(AnalysisData data, AnalysisOptions options) throws LiricalAnalysisException {
-        Collection<String> diseaseDatabasePrefixes = options.diseaseDatabases().stream()
-                .map(DiseaseDatabase::prefix)
-                .toList();
         Map<TermId, List<Gene2Genotype>> diseaseToGenotype = groupDiseasesByGene(data.genes());
 
         Optional<GenotypeLikelihoodRatio> genotypeLikelihoodRatio = configureGenotypeLikelihoodRatio(options.genomeBuild(),
                 options.variantDeleteriousnessThreshold(),
                 options.defaultVariantBackgroundFrequency(),
                 options.useStrictPenalties());
-        if (genotypeLikelihoodRatio.isEmpty())
+        if (genotypeLikelihoodRatio.isEmpty()) {
             throw new LiricalAnalysisException("Cannot configure genotype LR for %s".formatted(options.genomeBuild()));
+        }
 
         ProgressReporter progressReporter = new ProgressReporter(1_000, "diseases");
         Stream<TestResult> testResultStream = phenotypeService.diseases().hpoDiseases()
                 .parallel() // why not?
-                .filter(disease -> diseaseDatabasePrefixes.contains(disease.id().getPrefix()))
+                .filter(prepareDiseaseFilter(options.diseaseDatabases(), options.targetDiseases()))
                 .peek(d -> progressReporter.log())
                 .map(disease -> analyzeDisease(genotypeLikelihoodRatio.get(), disease, data, options, diseaseToGenotype))
                 .flatMap(Optional::stream);
@@ -78,6 +77,22 @@ public AnalysisResults run(AnalysisData data, AnalysisOptions options) throws Li
         }
     }
 
+    private static Predicate<HpoDisease> prepareDiseaseFilter(
+            Set<DiseaseDatabase> diseaseDatabasePrefixes,
+            Collection<TermId> targetDiseases
+    ) {
+        if (targetDiseases == null) {
+            // Restrict the analysis to the diseases with the chosen prefixes.
+            List<String> prefixes = diseaseDatabasePrefixes.stream()
+                    .map(DiseaseDatabase::prefix)
+                    .toList();
+            return disease -> prefixes.contains(disease.id().getPrefix());
+        } else {
+            // Restrict the analysis to the selected diseases.
+            return disease -> targetDiseases.contains(disease.id());
+        }
+    }
+
     private Map<TermId, List<Gene2Genotype>> groupDiseasesByGene(GenesAndGenotypes genes) {
         Map<TermId, Collection<TermId>> geneToDisease = phenotypeService.associationData().associations().geneIdToDiseaseIds();
         Map<TermId, List<Gene2Genotype>> diseaseToGenotype = new HashMap<>(genes.size());

diff --git a/...ain/java/org/monarchinitiative/lirical/core/likelihoodratio/PhenotypeLikelihoodRatio.java b/...ain/java/org/monarchinitiative/lirical/core/likelihoodratio/PhenotypeLikelihoodRatio.java
@@ -16,7 +16,7 @@
 
 /**
  * This class is designed to calculate the background and foreground frequencies of any HPO term in any disease
- * (This is calculated by {@link #initializeFrequencyMap()} and stored in {@link #hpoTerm2OverallFrequency}).
+ * (This is calculated by {@link #initializeFrequencyMap(MinimalOntology, HpoDiseases)} and stored in {@link #hpoTerm2OverallFrequency}).
  * The main entry point into this class is the function {@link #lrForObservedTerm}, which is called by
  * {@link LiricalAnalysisRunner} once for each HPO term
  * to which the case is annotation; it calls it once for each disease in our
@@ -39,11 +39,9 @@ public class PhenotypeLikelihoodRatio {
     public static final float DEFAULT_TERM_FREQUENCY = 1.f; // TODO - is this the right thing to do?
     /** The HPO ontology with all of its subontologies. */
     private final MinimalOntology ontology;
-    /** This map has one entry for each disease in our database. Key--the disease ID, e.g., OMIM:600200.*/
-    private final Map<TermId, HpoDisease> diseaseMap;
     private final LrWithExplanationFactory explanationFactory;
     /** Overall, i.e., background frequency of each HPO term. */
-    private Map<TermId, Double> hpoTerm2OverallFrequency = null;
+    private final Map<TermId, Double> hpoTerm2OverallFrequency;
     /**
      * This is the probability of a finding if the disease is not annotated to it and there
      * is no common ancestor except the root. There are many possible causes of findings called
@@ -62,9 +60,8 @@ public class PhenotypeLikelihoodRatio {
      */
     public PhenotypeLikelihoodRatio(MinimalOntology ontology, HpoDiseases diseases) {
         this.ontology = ontology;
-        this.diseaseMap = diseases.diseaseById();
         this.explanationFactory = new LrWithExplanationFactory(ontology); // TODO - DI?
-        initializeFrequencyMap();
+        this.hpoTerm2OverallFrequency = initializeFrequencyMap(ontology, diseases);
     }
 
     /**
@@ -153,7 +150,7 @@ public LrWithExplanation lrForObservedTerm(TermId queryTid, InducedDiseaseGraph
             }
             // If we get here, queryId is not directly annotated in the disease, and it is not a child
             // of a disease term, nor is a disease term a subclass of queryTid. The next bit of code
-            // checks whether they have a common ancestor that is more specfic that Phenotypic_Abnormality
+            // checks whether they have a common ancestor that is more specific than Phenotypic_Abnormality
             Term2Freq t2f = idg.getClosestAncestor(queryTid, ontology);
             if (t2f.nonRootCommonAncestor()) {
                 double numerator = t2f.frequency();
@@ -312,14 +309,25 @@ private double getProportionInChildren(TermId queryTid, TermId diseaseTid) {
 
     /**
      * Initialize the {@link #hpoTerm2OverallFrequency} object that has the background frequencies of each of the
-     * HPO terms in the ontology. */
-    private void initializeFrequencyMap() {
+     * HPO terms in the ontology.
+     *
+     * @return the frequency map
+     */
+    private static Map<TermId, Double> initializeFrequencyMap(
+            MinimalOntology ontology,
+            HpoDiseases diseases
+    ) {
         Map<TermId, Double> mp = new HashMap<>();
         for (TermId tid : ontology.nonObsoleteTermIds()) {
             mp.put(tid, 0.0D);
         }
-        Map<TermId, Double> mapbuilder = new HashMap<>();
-        for (HpoDisease dis : this.diseaseMap.values()) {
+
+        Set<TermId> seenDiseases = new HashSet<>(); // Make sure we only see each disease once.
+        for (HpoDisease dis : diseases) {
+            if (!seenDiseases.add(dis.id()))
+                // `add` returns true if the ID was added (was new).
+                // Therefore, we continue if the ID was NOT added.
+                continue;
             // We construct a map in order to get the maximum frequencies for any
             // given ancestor term, also in order to avoid double counting.
             Map<TermId, Double> updateMap=new HashMap<>();
@@ -349,19 +357,14 @@ private void initializeFrequencyMap() {
                                 : previous + updateMap.get(tid)); // cumulative
             }
         }
+        Map<TermId, Double> builder = new HashMap<>();
         // Now we need to normalize by the number of diseases.
-        double N = getNumberOfDiseases();
         for (Map.Entry<TermId, Double> me : mp.entrySet()) {
-            double f = me.getValue() / N;
-            mapbuilder.put(me.getKey(), f);
+            double f = me.getValue() / diseases.size();
+            builder.put(me.getKey(), f);
         }
-        hpoTerm2OverallFrequency = Map.copyOf(mapbuilder);
-        logger.trace("Got data on background frequency for " + hpoTerm2OverallFrequency.size() + " terms");
-    }
-
-    /** @return the number of diseases we are using for the calculations. */
-    private int getNumberOfDiseases() {
-        return diseaseMap.size();
+        logger.trace("Got data on background frequency for {} terms", builder.size());
+        return Map.copyOf(builder);
     }
 
 }
diff --git a/lirical-exomiser-db-adapter/pom.xml b/lirical-exomiser-db-adapter/pom.xml
@@ -5,7 +5,7 @@
     <parent>
         <artifactId>LIRICAL</artifactId>
         <groupId>org.monarchinitiative.lirical</groupId>
-        <version>2.0.3</version>
+        <version>2.0.4</version>
     </parent>
     <modelVersion>4.0.0</modelVersion>