diff --git a/causallearn/search/HiddenCausal/GIN/FisherTest.py b/causallearn/search/HiddenCausal/GIN/FisherTest.py
new file mode 100644
index 00000000..46b9893b
--- /dev/null
+++ b/causallearn/search/HiddenCausal/GIN/FisherTest.py
@@ -0,0 +1,21 @@
+import math
+from scipy.stats import chi2
+
+
+def FisherTest(pvals, alpha=0.01):
+    '''
+    Combine a list of p-values with Fisher's method.
+    Under H0, -2 * sum(log(p_i)) follows a chi-squared distribution
+    with 2 * len(pvals) degrees of freedom.
+    Returns (True, p) if the combined p-value exceeds alpha, else (False, p).
+    '''
+    # Clamp tiny p-values so log() stays finite and no single test dominates.
+    pvals = [max(pval, 1e-05) for pval in pvals]
+    fisher_stat = -2.0 * sum(math.log(pval) for pval in pvals)
+    fisher_pval = 1 - chi2.cdf(fisher_stat, 2 * len(pvals))
+    return fisher_pval > alpha, fisher_pval
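For a quick sanity check of Fisher's method as implemented above, the same combined p-value can be computed in vectorized form; a minimal sketch with made-up p-values:

    import numpy as np
    from scipy.stats import chi2

    pvals = [0.20, 0.35, 0.08]  # hypothetical p-values from three tests
    stat = -2.0 * np.sum(np.log(np.maximum(pvals, 1e-05)))  # Fisher statistic
    print(1 - chi2.cdf(stat, 2 * len(pvals)))  # second value of FisherTest(pvals)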
diff --git a/causallearn/search/HiddenCausal/GIN/GIN.py b/causallearn/search/HiddenCausal/GIN/GIN.py
index 17f31b20..f0a762a3 100644
--- a/causallearn/search/HiddenCausal/GIN/GIN.py
+++ b/causallearn/search/HiddenCausal/GIN/GIN.py
@@ -1,357 +1,75 @@
-from collections import deque
-from itertools import combinations
-
-import numpy as np
-from scipy.stats import chi2
-
-from causallearn.graph.GeneralGraph import GeneralGraph
-from causallearn.graph.GraphNode import GraphNode
-from causallearn.graph.NodeType import NodeType
-from causallearn.graph.Edge import Edge
-from causallearn.graph.Endpoint import Endpoint
-from causallearn.search.FCMBased.lingam.hsic import hsic_test_gamma
-from causallearn.utils.KCI.KCI import KCI_UInd
-
-
-def fisher_test(pvals):
-    pvals = [pval if pval >= 1e-5 else 1e-5 for pval in pvals]
-    fisher_stat = -2.0 * np.sum(np.log(pvals))
-    return 1 - chi2.cdf(fisher_stat, 2 * len(pvals))
-
-
-def GIN(data, indep_test_method='kci', alpha=0.05):
-    '''
-    Learning causal structure of Latent Variables for Linear Non-Gaussian Latent Variable Model
-    with Generalized Independent Noise Condition
-    Parameters
-    ----------
-    data : numpy ndarray
-        data set
-    indep_test_method : str, default='kci'
-        the name of the independence test being used
-    alpha : float, default=0.05
-        desired significance level of independence tests (p_value) in (0,1)
-    Returns
-    -------
-    G : general graph
-        causal graph
-    causal_order : list
-        causal order
-    '''
-    n = data.shape[1]
-    cov = np.cov(data.T)
-
-    if indep_test_method == 'kci':
-        kci = KCI_UInd()
-
-    if indep_test_method not in ['kci', 'hsic']:
-        raise NotImplementedError((f"Independent test method {indep_test_method} is not implemented."))
-
-    def indep_test(x, y, method):
-        if method == 'kci':
-            return kci.compute_pvalue(x, y)[0]
-        elif method == 'hsic':
-            return hsic_test_gamma(x, y)[1]
-        else:
-            raise NotImplementedError((f"Independent test method {indep_test_method} is not implemented."))
-
-    var_set = set(range(n))
-    cluster_size = 2
-    clusters_list = []
-    while cluster_size < len(var_set):
-        tmp_clusters_list = []
-        for cluster in combinations(var_set, cluster_size):
-            remain_var_set = var_set - set(cluster)
-            e = cal_e_with_gin(data, cov, list(cluster), list(remain_var_set))
-            pvals = []
-            for z in range(len(remain_var_set)):
-                pvals.append(indep_test(data[:, [z]], e[:, None], method=indep_test_method))
-            fisher_pval = fisher_test(pvals)
-            if fisher_pval >= alpha:
-                tmp_clusters_list.append(cluster)
-        tmp_clusters_list = merge_overlaping_cluster(tmp_clusters_list)
-        clusters_list = clusters_list + tmp_clusters_list
-        for cluster in tmp_clusters_list:
-            var_set -= set(cluster)
-        cluster_size += 1
-
-    causal_order = []  # this variable corresponds to K in paper
-    updated = True
-    while updated:
-        updated = False
-        X = []
-        Z = []
-        for cluster_k in causal_order:
-            cluster_k1, cluster_k2 = array_split(cluster_k, 2)
-            X += cluster_k1
-            Z += cluster_k2
-
-        for i, cluster_i in enumerate(clusters_list):
-            is_root = True
-            cluster_i1, cluster_i2 = array_split(cluster_i, 2)
-            for j, cluster_j in enumerate(clusters_list):
-                if i == j:
-                    continue
-                cluster_j1, cluster_j2 = array_split(cluster_j, 2)
-                e = cal_e_with_gin(data, cov, X + cluster_i1 + cluster_j1, Z + cluster_i2)
-                pvals = []
-                for z in range(len(Z + cluster_i2)):
-                    pvals.append(indep_test(data[:, [z]], e[:, None], method=indep_test_method))
-                fisher_pval = fisher_test(pvals)
-                if fisher_pval < alpha:
-                    is_root = False
-                    break
-            if is_root:
-                causal_order.append(cluster_i)
-                clusters_list.remove(cluster_i)
-                updated = True
-                break
-
-    G = GeneralGraph([])
-    for var in var_set:
-        o_node = GraphNode(f"X{var + 1}")
-        G.add_node(o_node)
-
-    latent_id = 1
-    l_nodes = []
-
-    for cluster in causal_order:
-        l_node = GraphNode(f"L{latent_id}")
-        l_node.set_node_type(NodeType.LATENT)
-        G.add_node(l_node)
-        for l in l_nodes:
-            G.add_directed_edge(l, l_node)
-        l_nodes.append(l_node)
-
-        for o in cluster:
-            o_node = GraphNode(f"X{o + 1}")
-            G.add_node(o_node)
-            G.add_directed_edge(l_node, o_node)
-        latent_id += 1
-
-    undirected_l_nodes = []
-
-    for cluster in clusters_list:
-        l_node = GraphNode(f"L{latent_id}")
-        l_node.set_node_type(NodeType.LATENT)
-        G.add_node(l_node)
-        for l in l_nodes:
-            G.add_directed_edge(l, l_node)
-
-        for l in undirected_l_nodes:
-            G.add_edge(Edge(l, l_node, Endpoint.TAIL, Endpoint.TAIL))
-
-        undirected_l_nodes.append(l_node)
-
-        for o in cluster:
-            o_node = GraphNode(f"X{o + 1}")
-            G.add_node(o_node)
-            G.add_directed_edge(l_node, o_node)
-        latent_id += 1
-
-    return G, causal_order
-
-
-def GIN_MI(data):
-    '''
-    Learning causal structure of Latent Variables for Linear Non-Gaussian Latent Variable Model
-    with Generalized Independent Noise Condition
-
-    Parameters
-    ----------
-    data : numpy ndarray
-        data set
-
-    Returns
-    -------
-    G : general graph
-        causal graph
-    causal_order : list
-        causal order
-    '''
-    v_labels = list(range(data.shape[1]))
-    v_set = set(v_labels)
-    cov = np.cov(data.T)
-
-    # Step 1: Finding Causal Clusters
-    cluster_list = []
-    min_cluster = {i: set() for i in v_set}
-    min_dep_score = {i: 1e9 for i in v_set}
-    for (x1, x2) in combinations(v_set, 2):
-        x_set = {x1, x2}
-        z_set = v_set - x_set
-        dep_statistic = cal_dep_for_gin(data, cov, list(x_set), list(z_set))
-        for i in x_set:
-            if min_dep_score[i] > dep_statistic:
-                min_dep_score[i] = dep_statistic
-                min_cluster[i] = x_set
-    for i in v_labels:
-        cluster_list.append(list(min_cluster[i]))
-
-    cluster_list = merge_overlaping_cluster(cluster_list)
-
-    # Step 2: Learning the Causal Order of Latent Variables
-    causal_order = []  # this variable corresponds to K in paper
-    while (len(cluster_list) != 0):
-        root = find_root(data, cov, cluster_list, causal_order)
-        causal_order.append(root)
-        cluster_list.remove(root)
-
-    latent_id = 1
-    l_nodes = []
-    G = GeneralGraph([])
-    for cluster in causal_order:
-        l_node = GraphNode(f"L{latent_id}")
-        l_node.set_node_type(NodeType.LATENT)
-        l_nodes.append(l_node)
-        G.add_node(l_node)
-        for l in l_nodes:
-            if l != l_node:
-                G.add_directed_edge(l, l_node)
-        for o in cluster:
-            o_node = GraphNode(f"X{o + 1}")
-            G.add_node(o_node)
-            G.add_directed_edge(l_node, o_node)
-        latent_id += 1
-
-    return G, causal_order
-
-
-def cal_e_with_gin(data, cov, X, Z):
-    cov_m = cov[np.ix_(Z, X)]
-    _, _, v = np.linalg.svd(cov_m)
-    omega = v.T[:, -1]
-    return np.dot(data[:, X], omega)
-
-
-def cal_dep_for_gin(data, cov, X, Z):
-    '''
-    Calculate the statistics of dependence via Generalized Independent Noise Condition
-
-    Parameters
-    ----------
-    data : data set (numpy ndarray)
-    cov : covariance matrix
-    X : test set variables
-    Z : condition set variables
-
-    Returns
-    -------
-    sta : test statistic
-    '''
-
-    e_xz = cal_e_with_gin(data, cov, X, Z)
-
-    sta = 0
-    for i in Z:
-        sta += hsic_test_gamma(e_xz, data[:, i])[0]
-    sta /= len(Z)
-    return sta
-
-
-def find_root(data, cov, clusters, causal_order):
-    '''
-    Find the causal order by statistics of dependence
-    Parameters
-    ----------
-    data : data set (numpy ndarray)
-    cov : covariance matrix
-    clusters : clusters of observed variables
-    causal_order : causal order
-    Returns
-    -------
-    root : latent root cause
-    '''
-    if len(clusters) == 1:
-        return clusters[0]
-    root = clusters[0]
-    dep_statistic_score = 1e30
-    for i in clusters:
-        for j in clusters:
-            if i == j:
-                continue
-            X = [i[0], j[0]]
-            Z = []
-            for k in range(1, len(i)):
-                Z.append(i[k])
-
-            if causal_order:
-                for k in causal_order:
-                    X.append(k[0])
-                    Z.append(k[1])
-
-            dep_statistic = cal_dep_for_gin(data, cov, X, Z)
-            if dep_statistic < dep_statistic_score:
-                dep_statistic_score = dep_statistic
-                root = i
-
-    return root
-
-
-def _get_all_elements(S):
-    result = set()
-    for i in S:
-        for j in i:
-            result |= {j}
-    return result
-
-
-# merging cluster
-def merge_overlaping_cluster(cluster_list):
-    v_labels = _get_all_elements(cluster_list)
-    if len(v_labels) == 0:
-        return []
-    cluster_dict = {i: -1 for i in v_labels}
-    cluster_b = {i: [] for i in v_labels}
-    cluster_len = 0
-    for i in range(len(cluster_list)):
-        for j in cluster_list[i]:
-            cluster_b[j].append(i)
-
-    visited = [False] * len(cluster_list)
-    cont = True
-    while cont:
-        cont = False
-        q = deque()
-        for i, val in enumerate(visited):
-            if not val:
-                q.append(i)
-                visited[i] = True
-                break
-        while q:
-            top = q.popleft()
-            for i in cluster_list[top]:
-                cluster_dict[i] = cluster_len
-                for j in cluster_b[i]:
-                    if not visited[j]:
-                        q.append(j)
-                        visited[j] = True
-
-        for i in visited:
-            if not i:
-                cont = True
-                break
-        cluster_len += 1
-
-    cluster = [[] for _ in range(cluster_len)]
-    for i in v_labels:
-        cluster[cluster_dict[i]].append(i)
-
-    return cluster
-
-
-def array_split(x, k):
-    x_len = len(x)
-    # div_points = []
-    sub_arys = []
-    start = 0
-    section_len = x_len // k
-    extra = x_len % k
-    for i in range(extra):
-        sub_arys.append(x[start:start + section_len + 1])
-        start = start + section_len + 1
-
-    for i in range(k - extra):
-        sub_arys.append(x[start:start + section_len])
-        start = start + section_len
-    return sub_arys
\ No newline at end of file
+from causallearn.graph.GeneralGraph import GeneralGraph
+from causallearn.graph.GraphNode import GraphNode
+from causallearn.graph.NodeType import NodeType
+from causallearn.search.FCMBased.lingam.hsic import hsic_test_gamma
+from causallearn.utils.KCI.KCI import KCI_UInd
+
+import causallearn.search.HiddenCausal.GIN.Infer_Causal_Order as ICO
+import causallearn.search.HiddenCausal.GIN.Identifying_Causal_Clusters as ICC
+
+
+def GIN(data, indep_test_method='kci', alpha=0.05, MAX_Factor=2):
+    '''
+    Learning causal structure of latent variables for the Linear Non-Gaussian
+    Latent Variable Model with the Generalized Independent Noise (GIN) condition
+    Parameters
+    ----------
+    data : numpy ndarray
+        data set
+    indep_test_method : str, default='kci'
+        the name of the independence test being used
+    alpha : float, default=0.05
+        desired significance level of independence tests (p_value) in (0, 1)
+    MAX_Factor : int, default=2
+        prior upper bound on the number of latent factors per cluster
+    Returns
+    -------
+    G : general graph
+        causal graph
+    causal_order : list
+        causal order
+    '''
+    if indep_test_method not in ['kci', 'hsic']:
+        raise NotImplementedError(f"Independent test method {indep_test_method} is not implemented.")
+
+    if indep_test_method == 'kci':
+        kci = KCI_UInd()
+
+        def indep_test(x, y):
+            return kci.compute_pvalue(x, y)[0]
+    else:  # 'hsic'
+        def indep_test(x, y):
+            return hsic_test_gamma(x, y)[1]
+
+    # Phase I: identify causal clusters via the GIN condition (HSIC/KCI test)
+    Cluster = ICC.FindCluster(data, indep_test, alpha, MAX_Factor)
+    # Phase II: infer the causal order via mutual information (k-nearest-neighbor estimate)
+    CausalOrder = ICO.LearnCausalOrder(Cluster, data)
+
+    latent_id = 1
+    l_nodes = []
+    G = GeneralGraph([])
+    for cluster in CausalOrder:
+        l_node = GraphNode(f"L{latent_id}")
+        l_node.set_node_type(NodeType.LATENT)
+        l_nodes.append(l_node)
+        G.add_node(l_node)
+        for l in l_nodes:
+            if l != l_node:
+                G.add_directed_edge(l, l_node)
+        for o in cluster:
+            o_node = GraphNode(f"X{o + 1}")
+            G.add_node(o_node)
+            G.add_directed_edge(l_node, o_node)
+        latent_id += 1
+
+    return G, CausalOrder
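A minimal usage sketch of the rewritten entry point, on hypothetical linear non-Gaussian data with two latent variables (the recovered clusters and order depend on the tests and sample size):

    import numpy as np
    from causallearn.search.HiddenCausal.GIN.GIN import GIN

    rng = np.random.RandomState(0)
    n = 2000
    L1 = rng.uniform(-1, 1, n)                   # latent root
    L2 = 2.0 * L1 + rng.uniform(-1, 1, n)        # latent child of L1
    noise = lambda: 0.2 * rng.uniform(-1, 1, n)  # non-Gaussian noise
    data = np.column_stack([
        1.0 * L1 + noise(), 2.0 * L1 + noise(), 0.5 * L1 + noise(),
        1.0 * L2 + noise(), 2.0 * L2 + noise(), 0.5 * L2 + noise(),
    ])

    G, causal_order = GIN(data, indep_test_method='hsic')
    print(causal_order)  # ideally [[0, 1, 2], [3, 4, 5]]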
diff --git a/causallearn/search/HiddenCausal/GIN/Identifying_Causal_Clusters.py b/causallearn/search/HiddenCausal/GIN/Identifying_Causal_Clusters.py
new file mode 100644
index 00000000..d7f8170f
--- /dev/null
+++ b/causallearn/search/HiddenCausal/GIN/Identifying_Causal_Clusters.py
@@ -0,0 +1,72 @@
+import itertools
+
+import causallearn.search.HiddenCausal.GIN.Test_GIN_Condition as Test_GIN_Condition  # HSIC/KCI-based GIN test for finding clusters
+import causallearn.search.HiddenCausal.GIN.Merge_Cluster as Merge_Cluster  # utils for merging overlapping clusters
+
+
+def FindCluster(data, test_function, alpha=0.05, Max_Factor=2):
+    '''
+    Learning causal clusters for the Linear Non-Gaussian Latent Variable Model
+    with the Generalized Independent Noise (GIN) condition
+    Parameters
+    ----------
+    data : numpy ndarray
+        data set
+    test_function : callable
+        the independence test being used; takes two samples, returns a p-value
+    alpha : float, default=0.05
+        desired significance level of independence tests (p_value) in (0, 1)
+    Max_Factor : int
+        maximum number of latent factors per cluster; 0 means no limit
+    Returns
+    -------
+    Cluster : dict
+        causal clusters keyed by factor number, e.g., {1: [['x1', 'x2']], 2: [['x4', 'x5', 'x6']]}
+    '''
+    # Initialize the variable set
+    indexs = list(range(data.shape[1]))
+    B = indexs.copy()
+    Cluster = {}
+    Grlen = 2
+
+    while len(B) >= Grlen and len(indexs) >= 2 * Grlen - 1:
+        LatentNum = Grlen - 1
+        Set_P = itertools.combinations(B, Grlen)
+        print('Identifying causal clusters with the ' + str(LatentNum) + '-factor model:')
+        for P in Set_P:
+            # tind = all variables \ P
+            tind = indexs.copy()
+            for t in P:
+                tind.remove(t)
+            if GIN(list(P), tind, data, test_function, alpha):
+                if LatentNum in Cluster:
+                    Cluster[LatentNum].append(list(P))
+                else:
+                    Cluster[LatentNum] = [list(P)]
+
+        # Merge overlapping clusters and update the remaining variable set
+        if LatentNum in Cluster:
+            Tclu = Merge_Cluster.merge_list(Cluster[LatentNum])
+            Cluster[LatentNum] = Tclu
+            for i in Tclu:
+                for j in i:
+                    if j in B:
+                        B.remove(j)
+
+        Grlen += 1
+        print('The identified clusters for the ' + str(LatentNum) + '-factor model:', Cluster)
+
+        if Max_Factor != 0 and (Grlen - 1) > Max_Factor:
+            break
+
+    return Cluster
+
+
+def GIN(X, Z, data, test_function, alpha=0.05):
+    return Test_GIN_Condition.GIN(X, Z, data, test_function, alpha)
+    # Alternative: combine the individual p-values with Fisher's method
+    # return Test_GIN_Condition.FisherGIN(X, Z, data, test_function, alpha)
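The dictionary returned by FindCluster groups the merged clusters by the number of latent factors they were identified with; a sketch of the shape, with made-up indices:

    # Hypothetical result on seven observed variables:
    # 0, 1, 2 share one latent factor; 3, 4, 5, 6 share two.
    Cluster = {
        1: [[0, 1, 2]],     # found by testing pairs (Grlen = 2)
        2: [[3, 4, 5, 6]],  # found by testing triples (Grlen = 3)
    }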
diff --git a/causallearn/search/HiddenCausal/GIN/Infer_Causal_Order.py b/causallearn/search/HiddenCausal/GIN/Infer_Causal_Order.py
new file mode 100644
index 00000000..cc3813d8
--- /dev/null
+++ b/causallearn/search/HiddenCausal/GIN/Infer_Causal_Order.py
@@ -0,0 +1,101 @@
+import causallearn.search.HiddenCausal.GIN.Test_GIN_Condition as GIN  # GIN test based on mutual information
+
+
+def LearnCausalOrder(cluster, data):
+    '''
+    Learning the causal order for the Linear Non-Gaussian Latent Variable Model
+    with the Generalized Independent Noise (GIN) condition
+    Parameters
+    ----------
+    cluster : dict
+        causal clusters learned in phase I
+    data : numpy ndarray
+        data set
+    Returns
+    -------
+    K : list
+        causal order
+    '''
+    Cluster = GetCluster(cluster)
+    K = []  # Initialize the causal order
+    while len(Cluster) > 0:
+        root = FindRoot(Cluster, data, K)
+        K.append(root)
+        Cluster.remove(root)
+    return K
+
+
+def FindRoot(clusters, data, causal_order):
+    '''
+    Find the root cluster, i.e., the one whose surrogate noise carries the
+    least mutual information with the other clusters
+    Parameters
+    ----------
+    clusters : list
+        clusters of observed variables
+    data : numpy ndarray
+        data set
+    causal_order : list
+        causal order identified so far
+    Returns
+    -------
+    root : list
+        the latent root cluster
+    '''
+    if len(clusters) == 1:
+        return clusters[0]
+    MI_lists = []
+    for i in clusters:
+        MI = 0
+        for j in clusters:
+            if i == j:
+                continue
+
+            X = [j[0]]
+            Z = []
+            r_1, r_2 = array_split(i, 2)
+            X = X + r_1
+            Z = Z + r_2
+
+            if causal_order:
+                for k in causal_order:
+                    k_1, k_2 = array_split(k, 2)
+                    X = X + k_1
+                    Z = Z + k_2
+
+            tmi = GIN.GIN_MI(X, Z, data)
+            MI += tmi
+
+        MI_lists.append(MI)
+
+    mins = MI_lists.index(min(MI_lists))
+    root = clusters[mins]
+    return root
+
+
+def GetCluster(cluster):
+    '''Flatten the cluster dictionary {factor_num: [clusters]} into a list of clusters.'''
+    Clu = []
+    for i in cluster.keys():
+        for j in cluster[i]:
+            Clu.append(j)
+    return Clu
+
+
+def array_split(x, k):
+    '''Split list x into k consecutive sub-lists of (nearly) equal length.'''
+    x_len = len(x)
+    sub_arys = []
+    start = 0
+    section_len = x_len // k
+    extra = x_len % k
+    for i in range(extra):
+        sub_arys.append(x[start:start + section_len + 1])
+        start = start + section_len + 1
+
+    for i in range(k - extra):
+        sub_arys.append(x[start:start + section_len])
+        start = start + section_len
+    return sub_arys
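FindRoot halves every cluster with array_split, using the first halves on the X side and the second halves on the Z side of the GIN test; a small illustration of the splitting on a hypothetical cluster:

    from causallearn.search.HiddenCausal.GIN.Infer_Causal_Order import array_split

    first, second = array_split([3, 4, 5, 6, 7], 2)
    print(first, second)  # [3, 4, 5] [6, 7]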
diff --git a/causallearn/search/HiddenCausal/GIN/Merge_Cluster.py b/causallearn/search/HiddenCausal/GIN/Merge_Cluster.py
new file mode 100644
index 00000000..37242783
--- /dev/null
+++ b/causallearn/search/HiddenCausal/GIN/Merge_Cluster.py
@@ -0,0 +1,62 @@
+from collections import deque
+from itertools import combinations
+
+'''Merge overlapping causal clusters'''
+
+
+def merge_list(L2):
+    l = L2.copy()
+
+    # Build a graph over cluster indices; connect clusters that share a variable
+    edges = []
+    s = list(map(set, l))
+    for i, j in combinations(range(len(s)), r=2):
+        if s[i].intersection(s[j]):
+            edges.append((i, j))
+
+    G = graph(edges)
+
+    result = []
+    unassigned = list(range(len(s)))
+
+    # Each connected component of overlapping clusters is merged into one cluster
+    for component in connected_components(G):
+        union = set().union(*(s[i] for i in component))
+        result.append(sorted(union))
+        unassigned = [i for i in unassigned if i not in component]
+
+    # Clusters that overlap with nothing are kept as they are
+    result.extend(map(sorted, (s[i] for i in unassigned)))
+
+    return result
+
+
+def bfs(graph, start):
+    visited, queue = set(), deque([start])
+    while queue:
+        vertex = queue.popleft()  # deque avoids O(n) list.pop(0)
+        if vertex not in visited:
+            visited.add(vertex)
+            queue.extend(graph[vertex] - visited)
+    return visited
+
+
+def connected_components(G):
+    seen = set()
+    for v in G:
+        if v not in seen:
+            c = set(bfs(G, v))
+            yield c
+            seen.update(c)
+
+
+def graph(edge_list):
+    result = {}
+    for source, target in edge_list:
+        result.setdefault(source, set()).add(target)
+        result.setdefault(target, set()).add(source)
+    return result
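A short usage example for merge_list: clusters that share a variable collapse into one, disjoint clusters pass through unchanged:

    from causallearn.search.HiddenCausal.GIN.Merge_Cluster import merge_list

    print(merge_list([[0, 1], [1, 2], [5, 6]]))  # [[0, 1, 2], [5, 6]]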
diff --git a/causallearn/search/HiddenCausal/GIN/Mutual_Information.py b/causallearn/search/HiddenCausal/GIN/Mutual_Information.py
new file mode 100644
index 00000000..037aa0e8
--- /dev/null
+++ b/causallearn/search/HiddenCausal/GIN/Mutual_Information.py
@@ -0,0 +1,185 @@
+'''
+Non-parametric computation of entropy and mutual information.
+Adapted by G Varoquaux from code created by R Brette, itself
+from several papers (see in the code).
+These computations rely on nearest-neighbor statistics.
+'''
+import numpy as np
+from sklearn import metrics
+
+from scipy.special import gamma, psi
+from scipy import ndimage
+from scipy.linalg import det
+from numpy import pi
+
+from sklearn.neighbors import NearestNeighbors
+
+__all__ = ['entropy', 'mutual_information', 'entropy_gaussian']
+
+EPS = np.finfo(float).eps
+
+
+def nearest_distances(X, k=1):
+    '''
+    X = array(N, M)
+    N = number of points
+    M = number of dimensions
+    returns the distance to the kth nearest neighbor for every point in X
+    '''
+    knn = NearestNeighbors(n_neighbors=k + 1)
+    knn.fit(X)
+    d, _ = knn.kneighbors(X)  # the first nearest neighbor is the point itself
+    return d[:, -1]  # distance to the kth nearest neighbor
+
+
+def entropy_gaussian(C):
+    '''
+    Entropy of a Gaussian variable with covariance matrix C
+    '''
+    if np.isscalar(C):  # C is the variance
+        return .5 * (1 + np.log(2 * pi)) + .5 * np.log(C)
+    else:
+        n = C.shape[0]  # dimension
+        return .5 * n * (1 + np.log(2 * pi)) + .5 * np.log(abs(det(C)))
+
+
+def entropy(X, k=1):
+    ''' Returns the entropy of X.
+    Parameters
+    ===========
+    X : array-like, shape (n_samples, n_features)
+        The data the entropy of which is computed
+    k : int, optional
+        number of nearest neighbors for density estimation
+    Notes
+    ======
+    Kozachenko, L. F. & Leonenko, N. N. 1987 Sample estimate of entropy
+    of a random vector. Probl. Inf. Transm. 23, 95-101.
+    See also: Evans, D. 2008 A computationally efficient estimator for
+    mutual information, Proc. R. Soc. A 464 (2093), 1203-1215.
+    and:
+    Kraskov A, Stogbauer H, Grassberger P. (2004). Estimating mutual
+    information. Phys Rev E 69(6 Pt 2):066138.
+    '''
+
+    # Distance to the kth nearest neighbor (Euclidean), one per sample
+    r = nearest_distances(X, k)
+    n, d = X.shape
+    volume_unit_ball = (pi ** (.5 * d)) / gamma(.5 * d + 1)
+    '''
+    F. Perez-Cruz, (2008). Estimation of Information Theoretic Measures
+    for Continuous Random Variables. Advances in Neural Information
+    Processing Systems 21 (NIPS). Vancouver (Canada), December.
+    return d*mean(log(r))+log(volume_unit_ball)+log(n-1)-log(k)
+    '''
+    return (d * np.mean(np.log(r + np.finfo(X.dtype).eps))
+            + np.log(volume_unit_ball) + psi(n) - psi(k))
+
+
+def mutual_information(variables, k=1):
+    '''
+    Returns the mutual information between any number of variables.
+    Each variable is a matrix X = array(n_samples, n_features).
+    Optionally, the following keyword argument can be specified:
+        k = number of nearest neighbors for density estimation
+    Example: mutual_information((X, Y)), mutual_information((X, Y, Z), k=5)
+    '''
+    if len(variables) < 2:
+        raise AttributeError(
+            "Mutual information must involve at least 2 variables")
+    all_vars = np.hstack(variables)
+    return (sum([entropy(X, k=k) for X in variables])
+            - entropy(all_vars, k=k))
+
+
+def mutual_information_2d(x, y, sigma=1, normalized=False):
+    """
+    Computes (normalized) mutual information between two 1D variates from a
+    joint histogram.
+    Parameters
+    ----------
+    x : 1D array
+        first variable
+    y : 1D array
+        second variable
+    sigma : float
+        sigma for Gaussian smoothing of the joint histogram
+    Returns
+    -------
+    nmi : float
+        the computed similarity measure
+    """
+    bins = (256, 256)
+
+    jh = np.histogram2d(x, y, bins=bins)[0]
+
+    # smooth the joint histogram with a Gaussian filter of given sigma
+    ndimage.gaussian_filter(jh, sigma=sigma, mode='constant',
+                            output=jh)
+
+    # compute marginal histograms
+    jh = jh + EPS
+    sh = np.sum(jh)
+    jh = jh / sh
+    s1 = np.sum(jh, axis=0).reshape((-1, jh.shape[0]))
+    s2 = np.sum(jh, axis=1).reshape((jh.shape[1], -1))
+
+    # Normalised Mutual Information of:
+    # Studholme, jhill & jhawkes (1998).
+    # "A normalized entropy measure of 3-D medical image alignment".
+    # in Proc. Medical Imaging 1998, vol. 3338, San Diego, CA, pp. 132-143.
+    if normalized:
+        mi = ((np.sum(s1 * np.log(s1)) + np.sum(s2 * np.log(s2)))
+              / np.sum(jh * np.log(jh))) - 1
+    else:
+        mi = (np.sum(jh * np.log(jh)) - np.sum(s1 * np.log(s1))
+              - np.sum(s2 * np.log(s2)))
+
+    return mi
+
+
+def _entropy(u):
+    """Calculate entropy using the maximum-entropy approximation."""
+    k1 = 79.047
+    k2 = 7.4129
+    gamma = 0.37457
+    return (1 + np.log(2 * np.pi)) / 2 - k1 * (np.mean(np.log(np.cosh(u))) - gamma) ** 2 - k2 * (np.mean(u * np.exp((-u ** 2) / 2))) ** 2
+
+
+def MI_2(x1, y1):
+    """Estimate mutual information with sklearn.
+    Note: normalized_mutual_info_score treats its inputs as discrete labels."""
+    x = list(x1.copy())
+    y = list(y1.copy())
+
+    result_NMI = metrics.normalized_mutual_info_score(x, y)
+
+    return result_NMI
+
+
+def MI_1(x1, y1):
+    """Estimate mutual information via the non-parametric k-NN entropy estimator."""
+    x = x1.reshape(len(x1), 1)
+    y = y1.reshape(len(y1), 1)
+
+    # set the number of neighbors for density estimation
+    if len(x) > 3000:
+        k = 15
+    else:
+        k = 10
+
+    mi = mutual_information((x, y), k)
+    return abs(mi)
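A quick sanity check of the k-NN estimator MI_1: a strongly dependent pair should score well above an independent one (the noise scales here are arbitrary):

    import numpy as np
    from causallearn.search.HiddenCausal.GIN.Mutual_Information import MI_1

    rng = np.random.RandomState(0)
    x = rng.uniform(-1, 1, 1000)
    y_dep = 2.0 * x + 0.1 * rng.uniform(-1, 1, 1000)  # dependent on x
    y_ind = rng.uniform(-1, 1, 1000)                  # independent of x
    print(MI_1(x, y_dep) > MI_1(x, y_ind))            # expected: True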
diff --git a/causallearn/search/HiddenCausal/GIN/Test_GIN_Condition.py b/causallearn/search/HiddenCausal/GIN/Test_GIN_Condition.py
new file mode 100644
index 00000000..5625257b
--- /dev/null
+++ b/causallearn/search/HiddenCausal/GIN/Test_GIN_Condition.py
@@ -0,0 +1,81 @@
+import numpy as np
+import causallearn.search.HiddenCausal.GIN.FisherTest as FisherTest
+import causallearn.search.HiddenCausal.GIN.Mutual_Information as ID
+
+
+def GIN(X, Z, data, test_function, alpha=0.05):
+    '''
+    Test the Generalized Independent Noise (GIN) condition
+    Parameters
+    ----------
+    X, Z : list
+        indices of the tested variables and of the conditioning variables
+    data : numpy ndarray
+        data set
+    test_function : callable
+        independence test; takes two samples and returns a p-value
+    alpha : float, default=0.05
+        desired significance level of independence tests (p_value) in (0, 1)
+    Returns
+    -------
+    Boolean : True iff the surrogate noise is independent of every variable in Z
+    '''
+    omega = getomega(data, X, Z)
+    tdata = data[:, X]
+    result = np.dot(tdata, omega)[:, None]  # surrogate noise E_{X||Z}
+    for i in Z:
+        temp = np.array(data[:, [i]])
+        pval = test_function(result, temp)
+        if pval <= alpha:  # dependence detected: the GIN condition fails
+            return False
+
+    return True
+
+
+def GIN_MI(X, Z, data):
+    """Score the GIN condition by the average mutual information (k-NN estimate)
+    between the surrogate noise and each variable in Z."""
+    omega = getomega(data, X, Z)
+    tdata = data[:, X]
+    result = np.dot(tdata, omega)
+    MIS = 0
+    for i in Z:
+        temp = np.array(data[:, i])
+        mi = ID.MI_1(result, temp)
+        MIS += mi
+    MIS = MIS / len(Z)
+
+    return MIS
+
+
+def FisherGIN(X, Z, data, test_function, alpha=0.01):
+    """Test the GIN condition by combining the individual p-values with Fisher's method."""
+    omega = getomega(data, X, Z)
+    tdata = data[:, X]
+    result = np.dot(tdata, omega)[:, None]
+    pvals = []
+
+    for i in Z:
+        temp = np.array(data[:, [i]])
+        pval = test_function(result, temp)
+        pvals.append(pval)
+
+    flag, fisher_pval = FisherTest.FisherTest(pvals, alpha)
+
+    return flag
+
+
+def getomega(data, X, Z):
+    # omega is the right singular vector of cov(Z, X) for the smallest
+    # singular value, so cov(Z, X) @ omega is (approximately) zero.
+    cov = np.cov(data, rowvar=False)
+    cov_m = cov[np.ix_(Z, X)]
+    _, _, v = np.linalg.svd(cov_m)
+    omega = v.T[:, -1]
+    return omega
diff --git a/causallearn/search/HiddenCausal/GIN/__init__.py b/causallearn/search/HiddenCausal/GIN/__init__.py
deleted file mode 100644
index e69de29b..00000000
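For intuition about getomega: since omega is chosen so that cov(Z, X) @ omega is minimal, the surrogate noise E = data[:, X] @ omega is uncorrelated with Z up to the smallest singular value (exactly zero when |X| > |Z|); a small numerical check on arbitrary data:

    import numpy as np
    from causallearn.search.HiddenCausal.GIN.Test_GIN_Condition import getomega

    rng = np.random.RandomState(0)
    data = rng.uniform(-1, 1, (500, 3))      # arbitrary data; X = [0, 1], Z = [2]
    omega = getomega(data, [0, 1], [2])
    cov = np.cov(data, rowvar=False)
    print(cov[np.ix_([2], [0, 1])] @ omega)  # ~ 0 up to floating-point error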