Almost there

Signed-off-by: Adam Li <[email protected]>
py-why · Mar 11, 2024 · 423b39b · 423b39b
1 parent 9ee8d2f
commit 423b39b
Show file tree

Hide file tree

Showing 12 changed files with 31,815 additions and 1,262 deletions.
diff --git a/doc/references.bib b/doc/references.bib
@@ -75,6 +75,13 @@ @article{hauser2012characterization
   publisher = {JMLR. org}
 }
 
+@article{heckerman2013learning,
+  title={Learning bayesian networks: The combination of knowledge and statistical data},
+  author={Heckerman, David and Geiger, Dan and Chickering, David Maxwell},
+  journal={arXiv preprint arXiv:1302.6815},
+  year={2013}
+}
+
 @article{Jaber2020causal,
   title   = {Causal discovery from soft interventions with unknown targets: Characterization and learning},
   author  = {Jaber, Amin and Kocaoglu, Murat and Shanmugam, Karthikeyan and Bareinboim, Elias},
@@ -317,6 +324,15 @@ @article{Sen2017model
   year    = {2017}
 }
 
+@inproceedings{scutari2016empirical,
+  title        = {An empirical-Bayes score for discrete Bayesian networks},
+  author       = {Scutari, Marco},
+  booktitle    = {Conference on probabilistic graphical models},
+  pages        = {438--448},
+  year         = {2016},
+  organization = {PMLR}
+}
+
 
 @inproceedings{Yu2020Bregman,
   title     = {Measuring the Discrepancy between Conditional Distributions: Methods, Properties and Applications},
@@ -349,6 +365,15 @@ @inproceedings{Zhang2011
 
 % Example refs
 
+@book{koller2009probabilistic,
+  author    = {Koller, Daphne and Friedman, Nir},
+  title     = {Probabilistic Graphical Models: Principles and Techniques - Adaptive Computation and Machine Learning},
+  year      = {2009},
+  isbn      = {0262013193},
+  publisher = {The MIT Press},
+  abstract  = {Most tasks require a person or an automated system to reason---to reach conclusions based on available information. The framework of probabilistic graphical models, presented in this book, provides a general approach for this task. The approach is model-based, allowing interpretable models to be constructed and then manipulated by reasoning algorithms. These models can also be learned automatically from data, allowing the approach to be used in cases where manually constructing a model is difficult or even impossible. Because uncertainty is an inescapable aspect of most real-world applications, the book focuses on probabilistic models, which make the uncertainty explicit and provide models that are more faithful to reality. Probabilistic Graphical Models discusses a variety of models, spanning Bayesian networks, undirected Markov networks, discrete and continuous models, and extensions to deal with dynamical systems and relational data. For each class of models, the text describes the three fundamental cornerstones: representation, inference, and learning, presenting both basic concepts and advanced techniques. Finally, the book considers the use of the proposed framework for causal reasoning and decision making under uncertainty. The main text in each chapter provides the detailed technical development of the key ideas. Most chapters also include boxes with additional material: skill boxes, which describe techniques; case study boxes, which discuss empirical cases related to the approach described in the text, including applications in computer vision, robotics, natural language understanding, and computational biology; and concept boxes, which present significant concepts drawn from the material in the chapter. Instructors (and readers) can group chapters in various combinations, from core topics to more technically advanced material, to suit their particular needs. Adaptive Computation and Machine Learning series}
+}
+
 @article{sachsdataset2005,
   author  = {Karen Sachs  and Omar Perez  and Dana Pe'er  and Douglas A. Lauffenburger  and Garry P. Nolan },
   title   = {Causal Protein-Signaling Networks Derived from Multiparameter Single-Cell Data},

diff --git a/dodiscover/score/score_function.py b/dodiscover/score/score_function.py
@@ -1,13 +1,18 @@
-from typing import Callable, Dict
+# Adapted from: https://github.com/juangamella/ges/
+# BSD 3-Clause License
+
+from typing import Callable, Union, Dict
 
 import numpy as np
 import pandas as pd
 
 
 class ScoreFunction:
-    def __init__(self, score: Callable) -> None:
+    def __init__(self, score: Union[Callable, str] = "bic") -> None:
+        """Create the score cache and resolve the scoring function.
+
+        Parameters
+        ----------
+        score : callable or str, default="bic"
+            Either a callable, which is stored unchanged as ``score_func``,
+            or the name of a built-in score. ``"bic"`` is the only name
+            recognized here and maps to :func:`bic_score`.
+
+        Raises
+        ------
+        ValueError
+            If ``score`` is neither a callable nor a recognized score name.
+        """
         self._cache: Dict = dict()
-        self.score_func = score
+
+        # A user-supplied callable keeps working exactly as it did when this
+        # constructor only accepted callables.
+        if callable(score):
+            self.score_func = score
+        elif score == "bic":
+            self.score_func = bic_score
+        else:
+            # Fail here rather than leaving ``score_func`` unset, which would
+            # only surface later as an AttributeError.
+            raise ValueError(
+                f"Unknown score {score!r}; pass a callable or the string 'bic'."
+            )
 
     def local_score(self, data: pd.DataFrame, source, source_parents) -> float:
         """Compute the local score of an edge.
@@ -76,3 +81,126 @@ def full_score(self, A):
         l0_term = self.lmbda * (np.sum(A != 0) + 1 * self.p)
         score = -0.5 * likelihood - l0_term
         return score
+
+
+def _mle(data, source, source_parents):
+    """Compute the maximum likelihood estimates of the parameters of a linear Gaussian model.
+
+    The source column is regressed on its parent columns with least squares
+    (no intercept column is added). When there are no parents, every
+    coefficient is zero and the noise variance is the variance of the source
+    column itself.
+
+    Parameters
+    ----------
+    data : pd.DataFrame
+        The dataset.
+    source : Node
+        The origin node. Must be a column of ``data``.
+    source_parents : list of Node
+        The parents of the source. Each must be a column of ``data``.
+
+    Returns
+    -------
+    beta : np.ndarray of shape (n_features,)
+        The MLE of the coefficients, indexed by column position in ``data``.
+        Entries of columns that are not parents are zero.
+    sigma : float
+        The MLE of the noise term variance. A value that is not strictly
+        positive is replaced by ``1e-5``.
+    """
+    _, n_features = data.shape
+    beta = np.zeros(n_features)
+
+    # pandas column selection needs a list, so accept any iterable of parents
+    source_parents = list(source_parents)
+
+    # compute the MLE of the coefficients
+    # using least squares regression
+    Y = data[source].to_numpy()
+
+    # without parents there is nothing to regress on: the residual is Y itself
+    residual = Y
+    if len(source_parents) > 0:
+        X = data[source_parents].to_numpy()
+        parents_coef = np.linalg.lstsq(X, Y, rcond=None)[0]
+        parents_idx = [data.columns.get_loc(p) for p in source_parents]
+        beta[parents_idx] = parents_coef
+        residual = Y - X @ parents_coef
+
+    # compute the estimate of the noise-term variance
+    sigma = np.var(residual)
+
+    # XXX: it is possible to compute things using the empirical covariance matrix
+    # beta = \Sigma_{source, Pa(source)} @ (\Sigma_{Pa(source), Pa(source)})^{-1}
+
+    # a variance is never negative but can be exactly zero (perfect fit or a
+    # constant column); floor it so that taking its logarithm stays finite
+    if sigma <= 0:
+        sigma = 1e-5
+
+    return beta, sigma
+
+
+def bic_score(data, source, source_parents):
+    """Score a variable given its parents with the Bayesian Information Criterion (BIC).
+
+    Follows the BIC score described in :footcite:`koller2009probabilistic`.
+
+    Parameters
+    ----------
+    data : pd.DataFrame
+        Dataset.
+    source : Node
+        Variable to score.
+    source_parents : list of Node
+        The parents of the source.
+
+    Returns
+    -------
+    score : float
+        The log-likelihood term minus the complexity penalty.
+    """
+    n_obs = len(data)
+
+    # only the noise variance of the fitted model is needed, not the coefficients
+    noise_var = _mle(data, source, source_parents)[1]
+
+    # log-likelihood term, evaluated at the maximum likelihood estimate
+    log_lik = -0.5 * n_obs * (1 + np.log(noise_var))
+
+    # complexity penalty: one term per parent plus one more
+    complexity = 0.5 * np.log(n_obs) * (len(source_parents) + 1)
+    return log_lik - complexity
+
+
+def bdeu_score(data, source, source_parents):
+    """Compute the Bayesian Dirichlet equivalent uniform (BDeu) score of an edge.
+
+    Placeholder for the BDeu score described in :footcite:`koller2009probabilistic`
+    and :footcite:`heckerman2013learning`. The score is not implemented yet.
+
+    Parameters
+    ----------
+    data : pd.DataFrame
+        Dataset.
+    source : Node
+        Variable to score.
+    source_parents : list of Node
+        The parents of the source.
+
+    Raises
+    ------
+    NotImplementedError
+        Always, until the score is implemented.
+    """
+    # Raise instead of silently returning None, which a caller would otherwise
+    # treat as a score.
+    raise NotImplementedError("The BDeu score is not implemented yet.")
+
+
+def bds_score(data, source, source_parents):
+    """Compute the Bayesian Dirichlet sparse (BDs) score of an edge.
+
+    Placeholder for the score described in :footcite:`scutari2016empirical`.
+    The score is not implemented yet.
+
+    Parameters
+    ----------
+    data : pd.DataFrame
+        Dataset.
+    source : Node
+        Variable to score.
+    source_parents : list of Node
+        The parents of the source.
+
+    Raises
+    ------
+    NotImplementedError
+        Always, until the score is implemented.
+    """
+    # NOTE(review): cites scutari2016empirical, which appears to be the paper
+    # that proposes BDs -- confirm the reference before release.
+    # Raise instead of silently returning None, which a caller would otherwise
+    # treat as a score.
+    raise NotImplementedError("The BDs score is not implemented yet.")
+
+
+def k2_score(data, source, source_parents):
+    """Compute the K2 score of an edge.
+
+    Placeholder for the score described in :footcite:`scutari2016empirical`.
+    The score is not implemented yet.
+
+    Parameters
+    ----------
+    data : pd.DataFrame
+        Dataset.
+    source : Node
+        Variable to score.
+    source_parents : list of Node
+        The parents of the source.
+
+    Raises
+    ------
+    NotImplementedError
+        Always, until the score is implemented.
+    """
+    # NOTE(review): scutari2016empirical looks like the BDs paper; K2 is
+    # presumably better attributed elsewhere -- confirm the citation.
+    # Raise instead of silently returning None, which a caller would otherwise
+    # treat as a score.
+    raise NotImplementedError("The K2 score is not implemented yet.")
diff --git a/dodiscover/testdata/ges/README.txt b/dodiscover/testdata/ges/README.txt
@@ -0,0 +1,2 @@
+The datasets were obtained from causal-learn (https://github.com/py-why/causal-learn/tree/main/tests/TestData) on December 1st, 2023.
+They are used to test dodiscover's native implementation of GES by comparing its results against causal-learn's.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		The datasets were obtained from causal-learn (https://github.com/py-why/causal-learn/tree/main/tests/TestData) on December 1st, 2023.
		They are used to test dodiscover's native implementation of GES by comparing its results against causal-learn's.