From 5faec3330f364d487ded68c1c1cd0fd7f108268b Mon Sep 17 00:00:00 2001 From: Stefan McCabe Date: Fri, 2 Aug 2019 12:13:52 -0400 Subject: [PATCH 01/12] dk2-series distance First pass at the dk-series (2k-series) distance. **Do not merge without discussion.** This is essentially the `DegreeDivergence`, but instead of the degree distribution it's the distribution of edges between degree-labelled nodes. Some outstanding questions and concerns: 1. This is not memory-efficient because it uses NxN dense matrices. 2. Have I understood the dk-series correctly? That is, does the `dk2_series` function return something meaningful? 3. I'm not sure if the dk-series is defined for directed graphs. For simplicity I have coerced to undirected graphs. --- netrd/distance/__init__.py | 3 +- netrd/distance/dk2_distance.py | 109 +++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 1 deletion(-) create mode 100644 netrd/distance/dk2_distance.py diff --git a/netrd/distance/__init__.py b/netrd/distance/__init__.py index 16488954..dd490537 100644 --- a/netrd/distance/__init__.py +++ b/netrd/distance/__init__.py @@ -16,6 +16,7 @@ from .quantum_jsd import QuantumJSD from .communicability_jsd import CommunicabilityJSD from .distributional_nbd import DistributionalNBD +from .dk2_distance import dk2Distance nbd = False try: @@ -26,7 +27,6 @@ pass -# from .dk2_distance import dK2Distance __all__ = [ 'Hamming', @@ -46,6 +46,7 @@ 'QuantumJSD', 'CommunicabilityJSD', 'DistributionalNBD', + 'dk2Distance', ] if nbd: diff --git a/netrd/distance/dk2_distance.py b/netrd/distance/dk2_distance.py new file mode 100644 index 00000000..08eedc56 --- /dev/null +++ b/netrd/distance/dk2_distance.py @@ -0,0 +1,109 @@ +""" +dk2_distance.py +-------------------------- + +Graph distance based on the dk-series. + +author: Brennan Klein & Stefan McCabe +email: brennanjamesklein@gmail.com +Submitted as part of the 2019 NetSI Collabathon. 
+ +""" + + +import networkx as nx +import numpy as np +import itertools as it +from collections import defaultdict +from .base import BaseDistance +from ..utilities import entropy, ensure_undirected + + +class dk2Distance(BaseDistance): + def dist(self, G1, G2): + r"""Compute the distance between two graphs by using the Jensen-Shannon + divergence between the :math:`2k`-series of the graphs. + + The :math:`dk`-series of a graph is the collection of distributions of + size :math:`d` subgraphs, where nodes are labelled by degrees. For + simplicity, we consider only the :math:`2k`-series, i.e., the + distribution of edges between nodes of degree :math:`(k_i, k_j)`. The + distance between these :math:`2k`-series is calculated using the + Jensen-Shannon divergence. + + Parameters + ---------- + + G1, G2 (nx.Graph) + two networkx graphs to be compared + + Returns + ------- + + dist (float) + the distance between `G1` and `G2`. + + References + ---------- + + .. [1] Orsini, Chiara, Marija M. Dankulov, Pol Colomer-de-Simón, + Almerima Jamakovic, Priya Mahadevan, Amin Vahdat, Kevin E. + Bassler, et al. 2015. “Quantifying Randomness in Real Networks.” + Nature Communications 6 (1). https://doi.org/10.1038/ncomms9627. + + """ + + def dk2_series(G): + """ + Calculate the 2k-series (i.e. the number of edges between + degree-labelled edges) for G. 
+ """ + + k_dict = dict(nx.degree(G)) + dk2 = defaultdict(int) + + for (i, j) in G.edges: + k_i = k_dict[i] + k_j = k_dict[j] + if k_i <= k_j: + dk2[(k_i, k_j)] += 1 + else: + dk2[(k_j, k_i)] += 1 + + # every edge should be counted once + assert sum(list(dk2.values())) == G.size() + + return dk2 + + G1 = ensure_undirected(G1) + G2 = ensure_undirected(G2) + + G1_dk = dk2_series(G1) + G2_dk = dk2_series(G2) + + N = max(len(G1), len(G2)) + + # note N^2 dense matrices + D1 = np.zeros((N, N)) + D2 = np.zeros((N, N)) + + for (i, j), k in G1_dk.items(): + D1[i, j] = k + for (i, j), k in G2_dk.items(): + D2[i, j] = k + + # these should be normalized by the number of edges + D1 = D1 / G1.size() + D2 = D2 / G2.size() + + # flatten matrices. this is safe because we've padded to the same size + G1_dk_normed = D1[np.triu_indices(N)].ravel() + G2_dk_normed = D2[np.triu_indices(N)].ravel() + + assert np.isclose(G1_dk_normed.sum(), 1) + assert np.isclose(G2_dk_normed.sum(), 1) + + dist = entropy.js_divergence(G1_dk_normed, G2_dk_normed) + self.results["dist"] = dist + + return dist From 8fd998c3c73356799f865e3795f0f9fe4e6a1091 Mon Sep 17 00:00:00 2001 From: Stefan McCabe Date: Fri, 2 Aug 2019 13:40:12 -0400 Subject: [PATCH 02/12] appease the autoformatter --- netrd/distance/__init__.py | 1 - netrd/distance/dk2_distance.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/netrd/distance/__init__.py b/netrd/distance/__init__.py index dd490537..5336f6b9 100644 --- a/netrd/distance/__init__.py +++ b/netrd/distance/__init__.py @@ -27,7 +27,6 @@ pass - __all__ = [ 'Hamming', 'Frobenius', diff --git a/netrd/distance/dk2_distance.py b/netrd/distance/dk2_distance.py index 08eedc56..fa884fde 100644 --- a/netrd/distance/dk2_distance.py +++ b/netrd/distance/dk2_distance.py @@ -72,7 +72,7 @@ def dk2_series(G): # every edge should be counted once assert sum(list(dk2.values())) == G.size() - + return dk2 G1 = ensure_undirected(G1) @@ -83,7 +83,7 @@ def dk2_series(G): N = 
max(len(G1), len(G2)) - # note N^2 dense matrices + # note N^2 dense matrices D1 = np.zeros((N, N)) D2 = np.zeros((N, N)) From 7d087bae1ee13dfb870f6ad4f6b35a57dc35160a Mon Sep 17 00:00:00 2001 From: Stefan McCabe Date: Mon, 12 Aug 2019 11:49:17 -0400 Subject: [PATCH 03/12] store 2k-distribution objects in dk2-dist --- netrd/distance/dk2_distance.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/netrd/distance/dk2_distance.py b/netrd/distance/dk2_distance.py index fa884fde..10f6efe9 100644 --- a/netrd/distance/dk2_distance.py +++ b/netrd/distance/dk2_distance.py @@ -81,6 +81,8 @@ def dk2_series(G): G1_dk = dk2_series(G1) G2_dk = dk2_series(G2) + self.results["dk_distributions"] = G1_dk, G2_dk + N = max(len(G1), len(G2)) # note N^2 dense matrices From f9d345af77daad6f897f2c85ef338867e8875b63 Mon Sep 17 00:00:00 2001 From: Stefan McCabe Date: Mon, 12 Aug 2019 11:49:48 -0400 Subject: [PATCH 04/12] add a few more clarifying comments about the dk2 logic --- netrd/distance/dk2_distance.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/netrd/distance/dk2_distance.py b/netrd/distance/dk2_distance.py index 10f6efe9..ead7d1ff 100644 --- a/netrd/distance/dk2_distance.py +++ b/netrd/distance/dk2_distance.py @@ -65,6 +65,10 @@ def dk2_series(G): for (i, j) in G.edges: k_i = k_dict[i] k_j = k_dict[j] + + # We're enforcing order here because at the end we're going to + # leverage that all the information can be stored in the upper + # triangular for convenience. if k_i <= k_j: dk2[(k_i, k_j)] += 1 else: @@ -81,6 +85,10 @@ def dk2_series(G): G1_dk = dk2_series(G1) G2_dk = dk2_series(G2) + # store the 2K-distributions + # We're storing here instead of later because the dict representations + # are more efficient than the following dense matrix representations, + # and the matrix representation can be easily obtained from the dict. 
self.results["dk_distributions"] = G1_dk, G2_dk N = max(len(G1), len(G2)) From 7f2d5e3cc1c6393f37126e3107fc8b5e20afa76e Mon Sep 17 00:00:00 2001 From: Stefan McCabe Date: Mon, 12 Aug 2019 12:34:36 -0400 Subject: [PATCH 05/12] move dk2_series function out of class definition --- netrd/distance/dk2_distance.py | 52 +++++++++++++++++----------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/netrd/distance/dk2_distance.py b/netrd/distance/dk2_distance.py index ead7d1ff..c6b1748b 100644 --- a/netrd/distance/dk2_distance.py +++ b/netrd/distance/dk2_distance.py @@ -53,32 +53,6 @@ def dist(self, G1, G2): """ - def dk2_series(G): - """ - Calculate the 2k-series (i.e. the number of edges between - degree-labelled edges) for G. - """ - - k_dict = dict(nx.degree(G)) - dk2 = defaultdict(int) - - for (i, j) in G.edges: - k_i = k_dict[i] - k_j = k_dict[j] - - # We're enforcing order here because at the end we're going to - # leverage that all the information can be stored in the upper - # triangular for convenience. - if k_i <= k_j: - dk2[(k_i, k_j)] += 1 - else: - dk2[(k_j, k_i)] += 1 - - # every edge should be counted once - assert sum(list(dk2.values())) == G.size() - - return dk2 - G1 = ensure_undirected(G1) G2 = ensure_undirected(G2) @@ -117,3 +91,29 @@ def dk2_series(G): self.results["dist"] = dist return dist + +def dk2_series(G): + """ + Calculate the 2k-series (i.e. the number of edges between + degree-labelled edges) for G. + """ + + k_dict = dict(nx.degree(G)) + dk2 = defaultdict(int) + + for (i, j) in G.edges: + k_i = k_dict[i] + k_j = k_dict[j] + + # We're enforcing order here because at the end we're going to + # leverage that all the information can be stored in the upper + # triangular for convenience. 
+ if k_i <= k_j: + dk2[(k_i, k_j)] += 1 + else: + dk2[(k_j, k_i)] += 1 + + # every edge should be counted once + assert sum(list(dk2.values())) == G.size() + + return dk2 From ea29f04b020344f3113db8cd1bb095b76be06cee Mon Sep 17 00:00:00 2001 From: Stefan McCabe Date: Mon, 12 Aug 2019 12:35:03 -0400 Subject: [PATCH 06/12] use sparse matrices instead of dense matrices --- netrd/distance/dk2_distance.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/netrd/distance/dk2_distance.py b/netrd/distance/dk2_distance.py index c6b1748b..2a6c2693 100644 --- a/netrd/distance/dk2_distance.py +++ b/netrd/distance/dk2_distance.py @@ -13,6 +13,7 @@ import networkx as nx import numpy as np +from scipy.sparse import dok_matrix import itertools as it from collections import defaultdict from .base import BaseDistance @@ -61,15 +62,14 @@ def dist(self, G1, G2): # store the 2K-distributions # We're storing here instead of later because the dict representations - # are more efficient than the following dense matrix representations, + # are more convenient than the following sparse matrix representations, # and the matrix representation can be easily obtained from the dict. self.results["dk_distributions"] = G1_dk, G2_dk N = max(len(G1), len(G2)) - # note N^2 dense matrices - D1 = np.zeros((N, N)) - D2 = np.zeros((N, N)) + D1 = dok_matrix((N, N)) + D2 = dok_matrix((N, N)) for (i, j), k in G1_dk.items(): D1[i, j] = k @@ -81,8 +81,8 @@ def dist(self, G1, G2): D2 = D2 / G2.size() # flatten matrices. 
this is safe because we've padded to the same size - G1_dk_normed = D1[np.triu_indices(N)].ravel() - G2_dk_normed = D2[np.triu_indices(N)].ravel() + G1_dk_normed = D1[np.triu_indices(N)].toarray().flatten() + G2_dk_normed = D2[np.triu_indices(N)].toarray().flatten() assert np.isclose(G1_dk_normed.sum(), 1) assert np.isclose(G2_dk_normed.sum(), 1) From d10d3e54dd623d58d85b09ad35e8049089daaccf Mon Sep 17 00:00:00 2001 From: Stefan McCabe Date: Mon, 12 Aug 2019 12:35:29 -0400 Subject: [PATCH 07/12] more autoformatting --- netrd/distance/dk2_distance.py | 1 + 1 file changed, 1 insertion(+) diff --git a/netrd/distance/dk2_distance.py b/netrd/distance/dk2_distance.py index 2a6c2693..ca513216 100644 --- a/netrd/distance/dk2_distance.py +++ b/netrd/distance/dk2_distance.py @@ -92,6 +92,7 @@ def dist(self, G1, G2): return dist + def dk2_series(G): """ Calculate the 2k-series (i.e. the number of edges between From 6f074c2946943b3725958a8798ec3bda084a41be Mon Sep 17 00:00:00 2001 From: Stefan McCabe Date: Mon, 12 Aug 2019 16:08:37 -0400 Subject: [PATCH 08/12] correct doc error --- netrd/distance/dk2_distance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/netrd/distance/dk2_distance.py b/netrd/distance/dk2_distance.py index ca513216..8347c42c 100644 --- a/netrd/distance/dk2_distance.py +++ b/netrd/distance/dk2_distance.py @@ -96,7 +96,7 @@ def dist(self, G1, G2): def dk2_series(G): """ Calculate the 2k-series (i.e. the number of edges between - degree-labelled edges) for G. + degree-labelled nodes) for G. 
""" k_dict = dict(nx.degree(G)) From ef4c55c2015e868997bbcf4bd1695d0bd75514a5 Mon Sep 17 00:00:00 2001 From: Stefan McCabe Date: Mon, 12 Aug 2019 16:09:08 -0400 Subject: [PATCH 09/12] allow arbitrary N in dk2_series --- netrd/distance/dk2_distance.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/netrd/distance/dk2_distance.py b/netrd/distance/dk2_distance.py index 8347c42c..3f2f172a 100644 --- a/netrd/distance/dk2_distance.py +++ b/netrd/distance/dk2_distance.py @@ -93,12 +93,15 @@ def dist(self, G1, G2): return dist -def dk2_series(G): +def dk2_series(G, N=None): """ Calculate the 2k-series (i.e. the number of edges between degree-labelled nodes) for G. """ + if N is None: + N = len(G) + k_dict = dict(nx.degree(G)) dk2 = defaultdict(int) From 9785e7d433839900d3a52349c23be9bd9a1b5bb0 Mon Sep 17 00:00:00 2001 From: Stefan McCabe Date: Mon, 12 Aug 2019 16:11:32 -0400 Subject: [PATCH 10/12] use coo instead of dok sparse matrix --- netrd/distance/dk2_distance.py | 42 +++++++++++++++------------------- 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/netrd/distance/dk2_distance.py b/netrd/distance/dk2_distance.py index 3f2f172a..20af80d7 100644 --- a/netrd/distance/dk2_distance.py +++ b/netrd/distance/dk2_distance.py @@ -13,7 +13,7 @@ import networkx as nx import numpy as np -from scipy.sparse import dok_matrix +from scipy.sparse import coo_matrix import itertools as it from collections import defaultdict from .base import BaseDistance @@ -56,33 +56,17 @@ def dist(self, G1, G2): G1 = ensure_undirected(G1) G2 = ensure_undirected(G2) - - G1_dk = dk2_series(G1) - G2_dk = dk2_series(G2) - - # store the 2K-distributions - # We're storing here instead of later because the dict representations - # are more convenient than the following sparse matrix representations, - # and the matrix representation can be easily obtained from the dict.
- self.results["dk_distributions"] = G1_dk, G2_dk - N = max(len(G1), len(G2)) - D1 = dok_matrix((N, N)) - D2 = dok_matrix((N, N)) - - for (i, j), k in G1_dk.items(): - D1[i, j] = k - for (i, j), k in G2_dk.items(): - D2[i, j] = k + D1 = dk2_series(G1, N) + D2 = dk2_series(G2, N) - # these should be normalized by the number of edges - D1 = D1 / G1.size() - D2 = D2 / G2.size() + # store the 2K-distributions + self.results["dk_distributions"] = D1, D2 # flatten matrices. this is safe because we've padded to the same size - G1_dk_normed = D1[np.triu_indices(N)].toarray().flatten() - G2_dk_normed = D2[np.triu_indices(N)].toarray().flatten() + G1_dk_normed = D1.toarray()[np.triu_indices(N)].flatten() + G2_dk_normed = D2.toarray()[np.triu_indices(N)].flatten() assert np.isclose(G1_dk_normed.sum(), 1) assert np.isclose(G2_dk_normed.sum(), 1) @@ -120,4 +104,14 @@ def dk2_series(G, N=None): # every edge should be counted once assert sum(list(dk2.values())) == G.size() - return dk2 + # convert from dict to sparse matrix + row = [i for (i, j) in dk2.keys()] + col = [j for (i, j) in dk2.keys()] + data = [x for x in dk2.values()] + + D = coo_matrix((data, (row, col)), shape=(N, N)) + + # this should be normalized by the number of edges + D = D / G.size() + + return D From c7ca20e623e5b878ccec7917f2a33575f59a4ca0 Mon Sep 17 00:00:00 2001 From: Stefan McCabe Date: Mon, 12 Aug 2019 18:30:31 -0400 Subject: [PATCH 11/12] dk2 distance -> dk series --- netrd/distance/__init__.py | 2 +- .../{dk2_distance.py => dk_series.py} | 58 +++++++++++++------ 2 files changed, 42 insertions(+), 18 deletions(-) rename netrd/distance/{dk2_distance.py => dk_series.py} (60%) diff --git a/netrd/distance/__init__.py b/netrd/distance/__init__.py index 5336f6b9..54e7eb6e 100644 --- a/netrd/distance/__init__.py +++ b/netrd/distance/__init__.py @@ -16,7 +16,7 @@ from .quantum_jsd import QuantumJSD from .communicability_jsd import CommunicabilityJSD from .distributional_nbd import DistributionalNBD -from 
.dk2_distance import dk2Distance +from .dk_series import dkSeries nbd = False try: diff --git a/netrd/distance/dk2_distance.py b/netrd/distance/dk_series.py similarity index 60% rename from netrd/distance/dk2_distance.py rename to netrd/distance/dk_series.py index 20af80d7..35f3710d 100644 --- a/netrd/distance/dk2_distance.py +++ b/netrd/distance/dk_series.py @@ -1,5 +1,5 @@ """ -dk2_distance.py +dk_series.py -------------------------- Graph distance based on the dk-series. @@ -20,16 +20,17 @@ from ..utilities import entropy, ensure_undirected -class dk2Distance(BaseDistance): - def dist(self, G1, G2): +class dkSeries(BaseDistance): + def dist(self, G1, G2, d=2): r"""Compute the distance between two graphs by using the Jensen-Shannon - divergence between the :math:`2k`-series of the graphs. + divergence between the :math:`dk`-series of the graphs. The :math:`dk`-series of a graph is the collection of distributions of size :math:`d` subgraphs, where nodes are labelled by degrees. For - simplicity, we consider only the :math:`2k`-series, i.e., the + simplicity, we currently consider only the :math:`1k`-series, i.e., the + degree distribution, or the :math:`2k`-series, i.e., the distribution of edges between nodes of degree :math:`(k_i, k_j)`. The - distance between these :math:`2k`-series is calculated using the + distance between these :math:`dk`-series is calculated using the Jensen-Shannon divergence. Parameters @@ -38,6 +39,9 @@ def dist(self, G1, G2): G1, G2 (nx.Graph) two networkx graphs to be compared + d (int) + the size of the subgraph to consider + Returns ------- @@ -58,22 +62,42 @@ def dist(self, G1, G2): G2 = ensure_undirected(G2) N = max(len(G1), len(G2)) - D1 = dk2_series(G1, N) - D2 = dk2_series(G2, N) + if d == 1: + from .degree_divergence import DegreeDivergence - # store the 2K-distributions - self.results["dk_distributions"] = D1, D2 + degdiv = DegreeDivergence() + dist = degdiv.dist(G1, G2) - # flatten matrices.
this is safe because we've padded to the same size - G1_dk_normed = D1.toarray()[np.triu_indices(N)].flatten() - G2_dk_normed = D2.toarray()[np.triu_indices(N)].flatten() + # the 2k-distance stores the distribution in a sparse matrix, + # so here we take the output of DegreeDivergence and + # produce a comparable object + hist1, hist2 = degdiv.results['degree_histograms'] + hist1 /= len(G1) + hist2 /= len(G2) + hist1 = coo_matrix(hist1) + hist2 = coo_matrix(hist2) - assert np.isclose(G1_dk_normed.sum(), 1) - assert np.isclose(G2_dk_normed.sum(), 1) + self.results["dk_distributions"] = hist1, hist2 - dist = entropy.js_divergence(G1_dk_normed, G2_dk_normed) - self.results["dist"] = dist + elif d == 2: + D1 = dk2_series(G1, N) + D2 = dk2_series(G2, N) + + # store the 2K-distributions + self.results["dk_distributions"] = D1, D2 + + # flatten matrices. this is safe because we've padded to the same size + G1_dk_normed = D1.toarray()[np.triu_indices(N)].flatten() + G2_dk_normed = D2.toarray()[np.triu_indices(N)].flatten() + assert np.isclose(G1_dk_normed.sum(), 1) + assert np.isclose(G2_dk_normed.sum(), 1) + + dist = entropy.js_divergence(G1_dk_normed, G2_dk_normed) + else: + raise NotImplementedError() + + self.results["dist"] = dist return dist From 9066f14b39b63ba513f1f3045e431903c18ddf04 Mon Sep 17 00:00:00 2001 From: Stefan McCabe Date: Mon, 12 Aug 2019 20:57:27 -0400 Subject: [PATCH 12/12] rename dk series in __all__ --- netrd/distance/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/netrd/distance/__init__.py b/netrd/distance/__init__.py index 54e7eb6e..0ff17c25 100644 --- a/netrd/distance/__init__.py +++ b/netrd/distance/__init__.py @@ -45,7 +45,7 @@ 'QuantumJSD', 'CommunicabilityJSD', 'DistributionalNBD', - 'dk2Distance', + 'dkSeries', ] if nbd: