diff --git a/causallearn/score/LocalScoreFunction.py b/causallearn/score/LocalScoreFunction.py
index e02ba737..a4af5361 100644
--- a/causallearn/score/LocalScoreFunction.py
+++ b/causallearn/score/LocalScoreFunction.py
@@ -34,9 +34,9 @@ def local_score_BIC(Data: ndarray, i: int, PAi: List[int], parameters=None) -> f
     if len(PAi) == 0:
         return n * np.log(cov[i, i])
 
-    yX = np.mat(cov[np.ix_([i], PAi)])
-    XX = np.mat(cov[np.ix_(PAi, PAi)])
-    H = np.log(cov[i, i] - yX * np.linalg.inv(XX) * yX.T)
+    yX = cov[np.ix_([i], PAi)]
+    XX = cov[np.ix_(PAi, PAi)]
+    H = np.log(cov[i, i] - yX @ np.linalg.inv(XX) @ yX.T)
 
     return n * H + np.log(n) * len(PAi) * lambda_value
 
@@ -68,9 +68,9 @@ def local_score_BIC_from_cov(
     if len(PAi) == 0:
         return n * np.log(cov[i, i])
 
-    yX = np.mat(cov[np.ix_([i], PAi)])
-    XX = np.mat(cov[np.ix_(PAi, PAi)])
-    H = np.log(cov[i, i] - yX * np.linalg.inv(XX) * yX.T)
+    yX = cov[np.ix_([i], PAi)]
+    XX = cov[np.ix_(PAi, PAi)]
+    H = np.log(cov[i, i] - yX @ np.linalg.inv(XX) @ yX.T)
 
     return n * H + np.log(n) * len(PAi) * lambda_value
 
diff --git a/causallearn/score/LocalScoreFunctionClass.py b/causallearn/score/LocalScoreFunctionClass.py
index 41ffb49f..8171d515 100644
--- a/causallearn/score/LocalScoreFunctionClass.py
+++ b/causallearn/score/LocalScoreFunctionClass.py
@@ -46,3 +46,9 @@ def score(self, i: int, PAi: List[int]) -> float:
             self.score_cache[i][hash_key] = self.local_score_fun(self.data, i, PAi, self.parameters)
 
         return self.score_cache[i][hash_key]
+
+    def score_nocache(self, i: int, PAi: List[int]) -> float:
+        if self.local_score_fun == local_score_BIC_from_cov:
+            return self.local_score_fun((self.cov, self.n), i, PAi, self.parameters)
+        else:
+            return self.local_score_fun(self.data, i, PAi, self.parameters)
\ No newline at end of file
diff --git a/causallearn/search/PermutationBased/BOSS.py b/causallearn/search/PermutationBased/BOSS.py
new file mode 100644
index 00000000..465d3659
--- /dev/null
+++ b/causallearn/search/PermutationBased/BOSS.py
@@ -0,0 +1,207 @@
+import random
+import sys
+import time
+import warnings
+from typing import Any, Dict, List, Optional
+
+import numpy as np
+from causallearn.graph.GeneralGraph import GeneralGraph
+from causallearn.graph.GraphNode import GraphNode
+from causallearn.score.LocalScoreFunction import (
+    local_score_BDeu,
+    local_score_BIC,
+    local_score_BIC_from_cov,
+    local_score_cv_general,
+    local_score_cv_multi,
+    local_score_marginal_general,
+    local_score_marginal_multi,
+)
+from causallearn.search.PermutationBased.gst import GST
+from causallearn.score.LocalScoreFunctionClass import LocalScoreClass
+from causallearn.utils.DAG2CPDAG import dag2cpdag
+
+
+def boss(
+    X: np.ndarray,
+    score_func: str = "local_score_BIC_from_cov",
+    parameters: Optional[Dict[str, Any]] = None,
+    verbose: Optional[bool] = True,
+    node_names: Optional[List[str]] = None,
+) -> GeneralGraph:
+    """
+    Perform the best order score search (BOSS) algorithm.
+
+    Parameters
+    ----------
+    X : data set (numpy ndarray), shape (n_samples, n_features). The input data, where n_samples is the number of samples and n_features is the number of features.
+    score_func : the string name of the score function (one of 'local_score_CV_general', 'local_score_marginal_general',
+                 'local_score_CV_multi', 'local_score_marginal_multi', 'local_score_BIC', 'local_score_BIC_from_cov', 'local_score_BDeu').
+    parameters : when using CV likelihood,
+                  parameters['kfold']: k-fold cross validation
+                  parameters['lambda']: regularization parameter
+                  parameters['dlabel']: for variables with multi-dimensions,
+                               indicate which dimensions belong to the i-th variable.
+    verbose : whether to print the time cost and verbose output of the algorithm.
+
+    Returns
+    -------
+    G : learned causal graph, where G.graph[j,i] = 1 and G.graph[i,j] = -1 indicates i --> j, G.graph[i,j] = G.graph[j,i] = -1 indicates i --- j.
+    """
+
+    X = X.copy()
+    n, p = X.shape
+    if n < p:
+        warnings.warn("The number of features is much larger than the sample size!")
+
+    if score_func == "local_score_CV_general":
+        # k-fold negative cross validated likelihood based on regression in RKHS
+        if parameters is None:
+            parameters = {
+                "kfold": 10,  # 10 fold cross validation
+                "lambda": 0.01,
+            }  # regularization parameter
+        localScoreClass = LocalScoreClass(
+            data=X, local_score_fun=local_score_cv_general, parameters=parameters
+        )
+    elif score_func == "local_score_marginal_general":
+        # negative marginal likelihood based on regression in RKHS
+        parameters = {}
+        localScoreClass = LocalScoreClass(
+            data=X, local_score_fun=local_score_marginal_general, parameters=parameters
+        )
+    elif score_func == "local_score_CV_multi":
+        # k-fold negative cross validated likelihood based on regression in RKHS
+        # for data with multi-variate dimensions
+        if parameters is None:
+            parameters = {
+                "kfold": 10,
+                "lambda": 0.01,
+                "dlabel": {},
+            }  # regularization parameter
+        for i in range(X.shape[1]):
+            parameters["dlabel"]["{}".format(i)] = i
+        localScoreClass = LocalScoreClass(
+            data=X, local_score_fun=local_score_cv_multi, parameters=parameters
+        )
+    elif score_func == "local_score_marginal_multi":
+        # negative marginal likelihood based on regression in RKHS
+        # for data with multi-variate dimensions
+        if parameters is None:
+            parameters = {"dlabel": {}}
+        for i in range(X.shape[1]):
+            parameters["dlabel"]["{}".format(i)] = i
+        localScoreClass = LocalScoreClass(
+            data=X, local_score_fun=local_score_marginal_multi, parameters=parameters
+        )
+    elif score_func == "local_score_BIC":
+        # SEM BIC score
+        warnings.warn("Please use 'local_score_BIC_from_cov' instead")
+        if parameters is None:
+            parameters = {"lambda_value": 2}
+        localScoreClass = LocalScoreClass(
+            data=X, local_score_fun=local_score_BIC, parameters=parameters
+        )
+    elif score_func == "local_score_BIC_from_cov":
+        # SEM BIC score
+        if parameters is None:
+            parameters = {"lambda_value": 2}
+        localScoreClass = LocalScoreClass(
+            data=X, local_score_fun=local_score_BIC_from_cov, parameters=parameters
+        )
+    elif score_func == "local_score_BDeu":
+        # BDeu score
+        localScoreClass = LocalScoreClass(
+            data=X, local_score_fun=local_score_BDeu, parameters=None
+        )
+    else:
+        raise Exception("Unknown function!")
+
+    score = localScoreClass
+    gsts = [GST(i, score) for i in range(p)]
+
+    node_names = [("X%d" % (i + 1)) for i in range(p)] if node_names is None else node_names
+    nodes = []
+
+    for name in node_names:
+        node = GraphNode(name)
+        nodes.append(node)
+
+    G = GeneralGraph(nodes)
+
+    runtime = time.perf_counter()
+
+    order = [v for v in range(p)]
+
+    parents = {v: [] for v in order}
+
+    variables = [v for v in order]
+    while True:
+        improved = False
+        random.shuffle(variables)
+        if verbose:
+            for i, v in enumerate(order):
+                parents[v].clear()
+                gsts[v].trace(order[:i], parents[v])
+            sys.stdout.write("\rBOSS edge count: %i " % np.sum([len(parents[v]) for v in range(p)]))
+            sys.stdout.flush()
+
+        for v in variables:
+            improved |= better_mutation(v, order, gsts)
+        if not improved: break
+
+    for i, v in enumerate(order):
+        parents[v].clear()
+        gsts[v].trace(order[:i], parents[v])
+
+    runtime = time.perf_counter() - runtime
+
+    if verbose:
+        sys.stdout.write("\nBOSS completed in: %.2fs \n" % runtime)
+        sys.stdout.flush()
+
+    for y in range(p):
+        for x in parents[y]:
+            G.add_directed_edge(nodes[x], nodes[y])
+
+    G = dag2cpdag(G)
+
+    return G
+
+
+def reversed_enumerate(items, j):
+    for w in reversed(items):
+        yield j, w
+        j -= 1
+
+
+def better_mutation(v, order, gsts):
+    i = order.index(v)
+    p = len(order)
+    scores = np.zeros(p + 1)
+
+    prefix = []
+    score = 0
+    for j, w in enumerate(order):
+        scores[j] = gsts[v].trace(prefix) + score
+        if v != w:
+            score += gsts[w].trace(prefix)
+        prefix.append(w)
+
+    scores[p] = gsts[v].trace(prefix) + score
+    best = p
+
+    prefix.append(v)
+    score = 0
+    for j, w in reversed_enumerate(order, p - 1):
+        if v != w:
+            prefix.remove(w)
+            score += gsts[w].trace(prefix)
+        scores[j] += score
+        if scores[j] > scores[best]: best = j
+
+    if scores[i] + 1e-6 > scores[best]: return False
+    order.remove(v)
+    order.insert(best - int(best > i), v)
+
+    return True
\ No newline at end of file
diff --git a/causallearn/search/PermutationBased/GRaSP.py b/causallearn/search/PermutationBased/GRaSP.py
index abcbf47c..f4d4aa14 100644
--- a/causallearn/search/PermutationBased/GRaSP.py
+++ b/causallearn/search/PermutationBased/GRaSP.py
@@ -10,11 +10,13 @@
 from causallearn.score.LocalScoreFunction import (
     local_score_BDeu,
     local_score_BIC,
+    local_score_BIC_from_cov,
     local_score_cv_general,
     local_score_cv_multi,
     local_score_marginal_general,
     local_score_marginal_multi,
 )
+from causallearn.search.PermutationBased.gst import GST
 from causallearn.score.LocalScoreFunctionClass import LocalScoreClass
 from causallearn.utils.DAG2CPDAG import dag2cpdag
 
@@ -33,6 +35,7 @@ def __init__(self, p, score):
             y = self.order[i]
             self.parents[y] = []
             self.local_scores[y] = -score.score(y, [])
+            # self.local_scores[y] = -score.score_nocache(y, [])
 
     def get(self, i):
         return self.order[i]
@@ -76,9 +79,8 @@ def len(self):
 
 def grasp(
     X: np.ndarray,
-    score_func: str = "local_score_BIC",
+    score_func: str = "local_score_BIC_from_cov",
     depth: Optional[int] = 3,
-    maxP: Optional[float] = None,
     parameters: Optional[Dict[str, Any]] = None,
     verbose: Optional[bool] = True,
     node_names: Optional[List[str]] = None,
@@ -90,9 +92,8 @@ def grasp(
     ----------
     X : data set (numpy ndarray), shape (n_samples, n_features). The input data, where n_samples is the number of samples and n_features is the number of features.
     score_func : the string name of score function. (str(one of 'local_score_CV_general', 'local_score_marginal_general',
-                    'local_score_CV_multi', 'local_score_marginal_multi', 'local_score_BIC', 'local_score_BDeu')).
+                    'local_score_CV_multi', 'local_score_marginal_multi', 'local_score_BIC', 'local_score_BIC_from_cov', 'local_score_BDeu')).
     depth : allowed maximum depth for DFS
-    maxP : allowed maximum number of parents when searching the graph
     parameters : when using CV likelihood,
                   parameters['kfold']: k-fold cross validation
                   parameters['lambda']: regularization parameter
@@ -105,38 +106,29 @@ def grasp(
     G : learned causal graph, where G.graph[j,i] = 1 and G.graph[i,j] = -1 indicates i --> j, G.graph[i,j] = G.graph[j,i] = -1 indicates i --- j.
""" + X = X.copy() n, p = X.shape if n < p: warnings.warn("The number of features is much larger than the sample size!") - X = np.mat(X) - if ( - score_func == "local_score_CV_general" - ): # % k-fold negative cross validated likelihood based on regression in RKHS + if score_func == "local_score_CV_general": + # k-fold negative cross validated likelihood based on regression in RKHS if parameters is None: parameters = { "kfold": 10, # 10 fold cross validation "lambda": 0.01, } # regularization parameter - if maxP is None: - maxP = X.shape[1] / 2 # maximum number of parents localScoreClass = LocalScoreClass( data=X, local_score_fun=local_score_cv_general, parameters=parameters ) - - elif ( - score_func == "local_score_marginal_general" - ): # negative marginal likelihood based on regression in RKHS + elif score_func == "local_score_marginal_general": + # negative marginal likelihood based on regression in RKHS parameters = {} - if maxP is None: - maxP = X.shape[1] / 2 # maximum number of parents localScoreClass = LocalScoreClass( data=X, local_score_fun=local_score_marginal_general, parameters=parameters ) - - elif ( - score_func == "local_score_CV_multi" - ): # k-fold negative cross validated likelihood based on regression in RKHS + elif score_func == "local_score_CV_multi": + # k-fold negative cross validated likelihood based on regression in RKHS # for data with multi-variate dimensions if parameters is None: parameters = { @@ -146,45 +138,44 @@ def grasp( } # regularization parameter for i in range(X.shape[1]): parameters["dlabel"]["{}".format(i)] = i - if maxP is None: - maxP = len(parameters["dlabel"]) / 2 localScoreClass = LocalScoreClass( data=X, local_score_fun=local_score_cv_multi, parameters=parameters ) - - elif ( - score_func == "local_score_marginal_multi" - ): # negative marginal likelihood based on regression in RKHS + elif score_func == "local_score_marginal_multi": + # negative marginal likelihood based on regression in RKHS # for data with multi-variate dimensions if parameters is None: parameters = {"dlabel": {}} for i in range(X.shape[1]): parameters["dlabel"]["{}".format(i)] = i - if maxP is None: - maxP = len(parameters["dlabel"]) / 2 localScoreClass = LocalScoreClass( data=X, local_score_fun=local_score_marginal_multi, parameters=parameters ) - - elif score_func == "local_score_BIC": # Greedy equivalence search with BIC score - parameters = {} - parameters["lambda_value"] = 2 - if maxP is None: - maxP = X.shape[1] / 2 + elif score_func == "local_score_BIC": + # SEM BIC score + warnings.warn("Please use 'local_score_BIC_from_cov' instead") + if parameters is None: + parameters = {"lambda_value": 2} localScoreClass = LocalScoreClass( data=X, local_score_fun=local_score_BIC, parameters=parameters ) - - elif score_func == "local_score_BDeu": # Greedy equivalence search with BDeu score - if maxP is None: - maxP = X.shape[1] / 2 + elif score_func == "local_score_BIC_from_cov": + # SEM BIC score + if parameters is None: + parameters = {"lambda_value": 2} + localScoreClass = LocalScoreClass( + data=X, local_score_fun=local_score_BIC_from_cov, parameters=parameters + ) + elif score_func == "local_score_BDeu": + # BDeu score localScoreClass = LocalScoreClass( data=X, local_score_fun=local_score_BDeu, parameters=None ) - else: raise Exception("Unknown function!") + score = localScoreClass + gsts = [GST(i, score) for i in range(p)] node_names = [("X%d" % (i + 1)) for i in range(p)] if node_names is None else node_names nodes = [] @@ -203,12 +194,11 @@ def grasp( y_parents = 
         candidates = [order.get(j) for j in range(0, i)]
 
-        grow(y, y_parents, candidates, score)
-        local_score = shrink(y, y_parents, score)
+        local_score = gsts[y].trace(candidates, y_parents)
         order.set_local_score(y, local_score)
         order.bump_edges(len(y_parents))
 
-    while dfs(depth - 1, set(), [], order, score):
+    while dfs(depth - 1, set(), [], order, gsts):
         if verbose:
             sys.stdout.write("\rGRaSP edge count: %i " % order.get_edges())
             sys.stdout.flush()
@@ -229,7 +219,7 @@ def grasp(
 
 
 # performs a dfs over covered tucks
-def dfs(depth: int, flipped: set, history: List[set], order, score):
+def dfs(depth: int, flipped: set, history: List[set], order, gsts):
 
     cache = [{}, {}, {}, 0]
 
@@ -257,7 +247,7 @@ def dfs(depth: int, flipped: set, history: List[set], order, score):
                 cache[3] = order.get_edges()
 
                 tuck(i, j, order)
-                edge_bump, score_bump = update(i, j, order, score)
+                edge_bump, score_bump = update(i, j, order, gsts)
 
                 # because things that should be zero sometimes are not
                 if score_bump > 1e-6:
@@ -276,7 +266,7 @@ def dfs(depth: int, flipped: set, history: List[set], order, score):
                     if len(flipped) > 0 and flipped not in history:
                         history.append(flipped)
 
-                        if depth > 0 and dfs(depth - 1, flipped, history, order, score):
+                        if depth > 0 and dfs(depth - 1, flipped, history, order, gsts):
                             return True
                         del history[-1]
 
@@ -291,7 +281,7 @@ def dfs(depth: int, flipped: set, history: List[set], order, score):
 
 
 # updates the parents and scores after a tuck
-def update(i: int, j: int, order, score):
+def update(i: int, j: int, order, gsts):
 
     edge_bump = 0
     old_score = 0
@@ -304,18 +294,9 @@ def update(i: int, j: int, order, score):
         edge_bump -= len(z_parents)
         old_score += order.get_local_score(z)
 
+        z_parents.clear()
         candidates = [order.get(l) for l in range(0, k)]
-
-        for w in [w for w in z_parents if w not in candidates]:
-            z_parents.remove(w)
-        shrink(z, z_parents, score)
-
-        for w in z_parents:
-            candidates.remove(w)
-
-        grow(z, z_parents, candidates, score)
-
-        local_score = shrink(z, z_parents, score)
+        local_score = gsts[z].trace(candidates, z_parents)
         order.set_local_score(z, local_score)
 
         edge_bump += len(z_parents)
@@ -324,64 +305,6 @@ def update(i: int, j: int, order, score):
 
     return edge_bump, new_score - old_score
 
 
-# grow of grow-shrink
-def grow(y: int, y_parents: List[int], candidates: List[int], score):
-
-    best = -score.score(y, y_parents)
-
-    add = None
-    checked = []
-    while add is not None or len(candidates) > 0:
-
-        if add is not None:
-            checked.remove(add)
-            y_parents.append(add)
-            candidates = checked
-            checked = []
-            add = None
-
-        while len(candidates) > 0:
-
-            x = candidates.pop()
-            y_parents.append(x)
-            current = -score.score(y, y_parents)
-            y_parents.remove(x)
-            checked.append(x)
-
-            if current > best:
-                best = current
-                add = x
-
-    return best
-
-
-# shrink of grow-shrink
-def shrink(y: int, y_parents: List[int], score):
-
-    best = -score.score(y, y_parents)
-
-    remove = None
-    checked = 0
-    while remove is not None or checked < len(y_parents):
-
-        if remove is not None:
-            y_parents.remove(remove)
-            checked = 0
-            remove = None
-
-        while checked < len(y_parents):
-            x = y_parents.pop(0)
-            current = -score.score(y, y_parents)
-            y_parents.append(x)
-            checked += 1
-
-            if current > best:
-                best = current
-                remove = x
-
-    return best
-
-
 # tucks the node at position i into position j
 def tuck(i: int, j: int, order):
 
diff --git a/causallearn/search/PermutationBased/gst.py b/causallearn/search/PermutationBased/gst.py
new file mode 100644
index 00000000..25e0da6b
--- /dev/null
+++ b/causallearn/search/PermutationBased/gst.py
@@ -0,0 +1,77 @@
+class GSTNode:
+
+    def __init__(self, tree, add=None, score=None):
+        if score is None: score = -tree.score.score_nocache(tree.vertex, [])
+        # if score is None: score = -tree.score.score(tree.vertex, [])
+        self.tree = tree
+        self.add = add
+        self.grow_score = score
+        self.shrink_score = score
+        self.branches = None
+        self.remove = None
+
+    def __lt__(self, other):
+        return self.grow_score < other.grow_score
+
+    def grow(self, available, parents):
+        self.branches = []
+        for add in available:
+            parents.append(add)
+            score = -self.tree.score.score_nocache(self.tree.vertex, parents)
+            # score = -self.tree.score.score(self.tree.vertex, parents)
+            parents.remove(add)
+            branch = GSTNode(self.tree, add, score)
+            if score > self.grow_score: self.branches.append(branch)
+        self.branches.sort()
+
+    def shrink(self, parents):
+        self.remove = []
+        while True:
+            best = None
+            for remove in [parent for parent in parents]:
+                parents.remove(remove)
+                score = -self.tree.score.score_nocache(self.tree.vertex, parents)
+                # score = -self.tree.score.score(self.tree.vertex, parents)
+                parents.append(remove)
+                if score > self.shrink_score:
+                    self.shrink_score = score
+                    best = remove
+            if best is None: break
+            self.remove.append(best)
+            parents.remove(best)
+
+    def trace(self, prefix, available, parents):
+        if self.branches is None: self.grow(available, parents)
+        for branch in self.branches:
+            available.remove(branch.add)
+            if branch.add in prefix:
+                parents.append(branch.add)
+                return branch.trace(prefix, available, parents)
+        if self.remove is None:
+            self.shrink(parents)
+            return self.shrink_score
+        for remove in self.remove: parents.remove(remove)
+        return self.shrink_score
+
+
+class GST:
+
+    def __init__(self, vertex, score):
+        self.vertex = vertex
+        self.score = score
+        self.root = GSTNode(self)
+        self.forbidden = [vertex]
+        self.required = []
+
+    def trace(self, prefix, parents=None):
+        if parents is None: parents = []
+        available = [i for i in range(self.score.data.shape[1]) if i not in self.forbidden]
+        return self.root.trace(prefix, available, parents)
+
+    def set_knowledge(self, forbidden, required):
+        self.forbidden = forbidden  # not implemented
+        self.required = required  # not implemented
+        self.reset()
+
+    def reset(self):
+        self.root = GSTNode(self)
\ No newline at end of file
diff --git a/tests/TestBOSS.py b/tests/TestBOSS.py
new file mode 100644
index 00000000..ea963d47
--- /dev/null
+++ b/tests/TestBOSS.py
@@ -0,0 +1,117 @@
+import unittest
+
+import numpy as np
+from causallearn.graph.AdjacencyConfusion import AdjacencyConfusion
+from causallearn.graph.ArrowConfusion import ArrowConfusion
+from causallearn.graph.GeneralGraph import GeneralGraph
+from causallearn.graph.GraphNode import GraphNode
+from causallearn.search.PermutationBased.BOSS import boss
+from causallearn.utils.DAG2CPDAG import dag2cpdag
+
+import gc
+
+
+def simulate_data(p, d, n):
+    """
+    Randomly generates an Erdos-Renyi directed acyclic graph given an ordering
+    and randomly simulates data with the provided parameters.
+
+    p = |variables|
+    d = |edges| / |possible edges|
+    n = sample size
+    """
+
+    # pe = |possible edges|
+    pe = int(p * (p - 1) / 2)
+
+    # ne = |edges|
+    ne = int(d * pe)
+
+    # generate edges
+    e = np.append(np.zeros(pe - ne), 0.5 * np.random.uniform(-1, 1, ne))
+    np.random.shuffle(e)
+    B = np.zeros([p, p])
+    B.T[np.triu_indices(p, 1)] = e
+
+    # generate variance terms
+    O = np.diag(np.ones(p))
+
+    # simulate data
+    X = np.random.multivariate_normal(np.zeros(p), O, n)
+    for i in range(p):
+        J = np.where(B[i])[0]
+        for j in J: X[:, i] += B[i, j] * X[:, j]
+
+    pi = [i for i in range(p)]
+    np.random.shuffle(pi)
+
+    return (B != 0)[pi][:, pi], X[:, pi]
+
+
+class TestBOSS(unittest.TestCase):
+    def test_boss(self):
+        ps = [30, 60]
+        ds = [0.1, 0.15]
+        n = 1000
+        reps = 5
+
+        gc.set_threshold(20000, 50, 50)
+        # gc.set_debug(gc.DEBUG_STATS)
+
+        for p in ps:
+            for d in ds:
+                stats = [[], [], [], []]
+                for rep in range(1, reps + 1):
+                    g0, X = simulate_data(p, d, n)
+                    print(
+                        "\nNodes:", p,
+                        "| Edges:", np.sum(g0),
+                        "| Avg Degree:", 2 * round(np.sum(g0) / p, 2),
+                        "| Rep:", rep,
+                    )
+
+                    node_names = [("X%d" % (i + 1)) for i in range(p)]
+                    nodes = []
+
+                    for name in node_names:
+                        node = GraphNode(name)
+                        nodes.append(node)
+
+                    G0 = GeneralGraph(nodes)
+                    for y in range(p):
+                        for x in np.where(g0[y] == 1)[0]:
+                            G0.add_directed_edge(nodes[x], nodes[y])
+
+                    G0 = dag2cpdag(G0)
+
+                    G = boss(X, parameters={'lambda_value': 4})
+                    gc.collect()
+
+                    AdjC = AdjacencyConfusion(G0, G)
+                    stats[0].append(AdjC.get_adj_precision())
+                    stats[1].append(AdjC.get_adj_recall())
+
+                    ArrC = ArrowConfusion(G0, G)
+                    stats[2].append(ArrC.get_arrows_precision())
+                    stats[3].append(ArrC.get_arrows_recall())
+
+                    print(
+                        [
+                            ["AP", "AR", "AHP", "AHR"][i]
+                            + ": "
+                            + str(round(stats[i][-1], 2))
+                            for i in range(4)
+                        ]
+                    )
+
+                print(
+                    "\nOverall Stats: \n",
+                    [
+                        ["AP", "AR", "AHP", "AHR"][i]
+                        + ": "
+                        + str(round(np.sum(stats[i]) / reps, 2))
+                        for i in range(4)
+                    ],
+                )
+
+        print("finish")
diff --git a/tests/TestGRaSP.py b/tests/TestGRaSP.py
index 7482dcee..f1fe0dd0 100644
--- a/tests/TestGRaSP.py
+++ b/tests/TestGRaSP.py
@@ -7,61 +7,45 @@
 from causallearn.graph.GraphNode import GraphNode
 from causallearn.search.PermutationBased.GRaSP import grasp
 from causallearn.utils.DAG2CPDAG import dag2cpdag
-from causallearn.utils.GraphUtils import GraphUtils
 
-import matplotlib.image as mpimg
-import matplotlib.pyplot as plt
-import io
+import gc
 
-def random_dag(p, d):
+
+def simulate_data(p, d, n):
     """
-    Randomly generates an Erdos-Renyi direct acyclic graph given an ordering.
+    Randomly generates an Erdos-Renyi directed acyclic graph given an ordering
+    and randomly simulates data with the provided parameters.
 
     p = |variables|
     d = |edges| / |possible edges|
+    n = sample size
     """
 
     # npe = |possible edges|
     pe = int(p * (p - 1) / 2)
 
-    # e = |edges|
+    # ne = |edges|
     ne = int(d * pe)
 
     # generate edges
-    e = np.append(np.zeros(pe - ne, "uint8"), np.ones(ne, "uint8"))
+    e = np.append(np.zeros(pe - ne), 0.5 * np.random.uniform(-1, 1, ne))
     np.random.shuffle(e)
-
-    # generate graph
-    g = np.zeros([p, p], "uint8")
-    g.T[np.triu_indices(p, 1)] = e
-
-    return g
-
-
-def parameterize_dag(g):
-    """
-    Randomly parameterize a directed acyclic graph.
-
-    g = directed acyclic graph (adjacency matrix)
-    """
-
-    # p = |variables|
-    p = g.shape[0]
-
-    # e = |edges|
-    e = np.sum(g)
+    B = np.zeros([p, p])
+    B.T[np.triu_indices(p, 1)] = e
 
     # generate variance terms
-    o = np.diag(np.ones(p))
+    O = np.diag(np.ones(p))
 
-    # generate edge weights (edge parameters uniformly sampled [-1.0, 1.0])
-    b = np.zeros([p, p])
-    b[np.where(g == 1)] = np.random.uniform(-1, 1, e)
+    # simulate data
+    X = np.random.multivariate_normal(np.zeros(p), O, n)
+    for i in range(p):
+        J = np.where(B[i])[0]
+        for j in J: X[:, i] += B[i, j] * X[:, j]
 
-    # calculate covariance
-    s = np.dot(np.dot(np.linalg.inv(np.eye(p) - b), o), np.linalg.inv(np.eye(p) - b).T)
+    pi = [i for i in range(p)]
+    np.random.shuffle(pi)
 
-    return s
+    return (B != 0)[pi][:, pi], X[:, pi]
 
 
 class TestGRaSP(unittest.TestCase):
@@ -71,25 +55,22 @@ def test_grasp(self):
         n = 1000
         reps = 5
 
+        gc.set_threshold(20000, 50, 50)
+        # gc.set_debug(gc.DEBUG_STATS)
+
         for p in ps:
             for d in ds:
                 stats = [[], [], [], []]
                 for rep in range(1, reps + 1):
-                    g0 = random_dag(p, d)
+                    g0, X = simulate_data(p, d, n)
                    print(
-                        "\nNodes:",
-                        p,
-                        "| Edges:",
-                        np.sum(g0),
-                        "| Avg Degree:",
-                        2 * np.sum(g0) / p,
-                        "| Rep:",
-                        rep,
+                        "\nNodes:", p,
+                        "| Edges:", np.sum(g0),
+                        "| Avg Degree:", 2 * round(np.sum(g0) / p, 2),
+                        "| Rep:", rep,
                     )
 
-                    cov = parameterize_dag(g0)
-                    X = np.random.multivariate_normal(np.zeros(p), cov, n)
-                    node_names = [("x%d" % i) for i in range(p)]
+                    node_names = [("X%d" % (i + 1)) for i in range(p)]
                     nodes = []
 
                     for name in node_names:
@@ -103,15 +84,8 @@ def test_grasp(self):
 
                     G0 = dag2cpdag(G0)
 
-                    G = grasp(X)
-
-                    pyd = GraphUtils.to_pydot(G)
-                    tmp_png = pyd.create_png(f="png")
-                    fp = io.BytesIO(tmp_png)
-                    img = mpimg.imread(fp, format='png')
-                    plt.axis('off')
-                    plt.imshow(img)
-                    plt.show()
+                    G = grasp(X, depth=1, parameters={'lambda_value': 4})
+                    gc.collect()
 
                     AdjC = AdjacencyConfusion(G0, G)
                     stats[0].append(AdjC.get_adj_precision())
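
A few illustrative sketches for reviewers follow; none of this code is part of the patch. First, a minimal end-to-end run of the new boss() entry point, with the data generator mirroring simulate_data in tests/TestBOSS.py; the seed and the small p, d, n values are arbitrary choices for the example:

import numpy as np
from causallearn.search.PermutationBased.BOSS import boss

np.random.seed(0)

# Linear-Gaussian SEM over a random lower-triangular weight matrix,
# following simulate_data in tests/TestBOSS.py.
p, d, n = 10, 0.2, 1000
pe = p * (p - 1) // 2                        # possible edges
ne = int(d * pe)                             # sampled edges
e = np.append(np.zeros(pe - ne), 0.5 * np.random.uniform(-1, 1, ne))
np.random.shuffle(e)
B = np.zeros([p, p])
B.T[np.triu_indices(p, 1)] = e               # strictly lower-triangular weights

X = np.random.multivariate_normal(np.zeros(p), np.eye(p), n)
for i in range(p):                           # columns are already in causal order
    for j in np.where(B[i])[0]:
        X[:, i] += B[i, j] * X[:, j]

# Default score is local_score_BIC_from_cov; lambda_value scales the BIC penalty.
G = boss(X, parameters={"lambda_value": 2}, verbose=False)
print(G.graph)  # CPDAG: G.graph[j, i] = 1 and G.graph[i, j] = -1 encode i --> j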
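
Second, the np.mat removal in LocalScoreFunction.py is behavior-preserving: with plain ndarrays the `*` products become `@`, and the quantity inside the log is still the residual variance of variable i regressed on its parents, i.e. the Schur complement cov[i,i] - Sigma_{i,PA} Sigma_{PA,PA}^{-1} Sigma_{PA,i}, so the local score remains n*log(sigma^2) + log(n)*|PAi|*lambda. A standalone sketch of the same computation (bic_local_score is an illustrative name, not a library function):

import numpy as np

def bic_local_score(cov, n, i, PAi, lambda_value=2.0):
    # Mirrors the updated local_score_BIC_from_cov: n * log(residual variance
    # of i given PAi) plus the scaled BIC penalty per parent.
    if len(PAi) == 0:
        return n * np.log(cov[i, i])
    yX = cov[i, PAi]                    # cross-covariance vector, shape (k,)
    XX = cov[np.ix_(PAi, PAi)]          # parent covariance, shape (k, k)
    resid = cov[i, i] - yX @ np.linalg.inv(XX) @ yX  # Schur complement
    return n * np.log(resid) + np.log(n) * len(PAi) * lambda_value

rng = np.random.default_rng(0)
X = rng.standard_normal((500, 3))
X[:, 2] += 0.8 * X[:, 0]
cov = np.cov(X.T)
print(bic_local_score(cov, 500, 2, [0]))  # lower (better) than the empty parent set
print(bic_local_score(cov, 500, 2, []))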
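
Finally, better_mutation in BOSS.py finds the best insertion position for v in two sweeps: a forward pass scores v after every prefix while accumulating the other vertices' scores, and a backward pass re-scores the vertices that would newly see v in their prefix; the grow-shrink trees in gst.py keep the repeated trace() calls cheap by memoizing each grow path. A toy rendering of the same two-sweep logic, with a plain callback standing in for the GST objects (the callback interface is hypothetical; the real code calls gsts[w].trace(prefix)):

from typing import Callable, List

def best_insertion(v: int, order: List[int],
                   local: Callable[[int, List[int]], float]) -> bool:
    # local(x, prefix) plays the role of gsts[x].trace(prefix): the best
    # attainable local score of x with parents drawn from prefix.
    i, p = order.index(v), len(order)
    scores = [0.0] * (p + 1)

    # Forward sweep: scores[j] = score of v placed just before order[j],
    # plus the accumulated scores of the vertices to its left.
    prefix, acc = [], 0.0
    for j, w in enumerate(order):
        scores[j] = local(v, prefix) + acc
        if w != v:
            acc += local(w, prefix)
        prefix.append(w)
    scores[p] = local(v, prefix) + acc

    # Backward sweep: vertices at or after position j gain v as a candidate parent.
    best, acc = p, 0.0
    prefix.append(v)
    for j in range(p - 1, -1, -1):
        w = order[j]
        if w != v:
            prefix.remove(w)
            acc += local(w, prefix)
        scores[j] += acc
        if scores[j] > scores[best]:
            best = j

    if scores[i] + 1e-6 > scores[best]:  # no strictly better position
        return False
    order.remove(v)
    order.insert(best - int(best > i), v)
    return True

# Toy score: each vertex prefers to appear after its predecessors in `target`.
target = [0, 1, 2]
local = lambda x, prefix: float(len([y for y in set(prefix)
                                     if target.index(y) < target.index(x)]))
order = [2, 0, 1]
while any(best_insertion(v, order, local) for v in list(order)):
    pass
print(order)  # [0, 1, 2]

As in the real implementation, a vertex is only moved when the gain clears the 1e-6 tolerance, which guards against cycling on numerically tied scores.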