Skip to content

Commit

Permalink
dk-series distance (#243)
Browse files Browse the repository at this point in the history
* dk2-series distance

First pass at the dk-series (2k-series) distance. This is essentially the `DegreeDivergence`, but instead of the degree distribution it's the distribution of edges between degree-labelled
nodes.
  • Loading branch information
sdmccabe authored and leotrs committed Aug 13, 2019
1 parent f049318 commit 4d9c927
Show file tree
Hide file tree
Showing 2 changed files with 143 additions and 2 deletions.
4 changes: 2 additions & 2 deletions netrd/distance/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from .quantum_jsd import QuantumJSD
from .communicability_jsd import CommunicabilityJSD
from .distributional_nbd import DistributionalNBD
from .dk_series import dkSeries

nbd = False
try:
Expand All @@ -26,8 +27,6 @@
pass


# from .dk2_distance import dK2Distance

__all__ = [
'Hamming',
'Frobenius',
Expand All @@ -46,6 +45,7 @@
'QuantumJSD',
'CommunicabilityJSD',
'DistributionalNBD',
'dkSeries',
]

if nbd:
Expand Down
141 changes: 141 additions & 0 deletions netrd/distance/dk_series.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
"""
dk_series.py
--------------------------
Graph distance based on the dk-series.
author: Brennan Klein & Stefan McCabe
email: [email protected]
Submitted as part of the 2019 NetSI Collabathon.
"""


import networkx as nx
import numpy as np
from scipy.sparse import coo_matrix
import itertools as it
from collections import defaultdict
from .base import BaseDistance
from ..utilities import entropy, ensure_undirected


class dkSeries(BaseDistance):
    """Graph distance based on the Jensen-Shannon divergence of dk-series."""

    def dist(self, G1, G2, d=2):
        r"""Compute the distance between two graphs by using the Jensen-Shannon
        divergence between the :math:`dk`-series of the graphs.

        The :math:`dk`-series of a graph is the collection of distributions of
        size :math:`d` subgraphs, where nodes are labelled by degrees. For
        simplicity, we currently consider only the :math:`1k`-series, i.e., the
        degree distribution, or the :math:`2k`-series, i.e., the
        distribution of edges between nodes of degree :math:`(k_i, k_j)`. The
        distance between these :math:`dk`-series is calculated using the
        Jensen-Shannon divergence.

        The distributions that were compared are stored as a pair of sparse
        matrices in ``self.results["dk_distributions"]`` and the distance
        itself in ``self.results["dist"]``.

        Parameters
        ----------
        G1, G2 (nx.Graph)
            two networkx graphs to be compared

        d (int)
            the size of the subgraph to consider; only 1 and 2 are supported

        Returns
        -------
        dist (float)
            the distance between `G1` and `G2`.

        Raises
        ------
        NotImplementedError
            if `d` is neither 1 nor 2.

        References
        ----------
        .. [1] Orsini, Chiara, Marija M. Dankulov, Pol Colomer-de-Simón,
               Almerima Jamakovic, Priya Mahadevan, Amin Vahdat, Kevin E.
               Bassler, et al. 2015. “Quantifying Randomness in Real Networks.”
               Nature Communications 6 (1). https://doi.org/10.1038/ncomms9627.

        """

        G1 = ensure_undirected(G1)
        G2 = ensure_undirected(G2)

        # Both 2k-distributions are padded to this common size so that they
        # can be compared entry by entry.
        # NOTE(review): a degree >= N (e.g. produced by self-loops) would not
        # fit in an (N, N) matrix -- confirm such inputs cannot reach here.
        N = max(len(G1), len(G2))

        if d == 1:
            # Imported here rather than at module level, as in the original.
            from .degree_divergence import DegreeDivergence

            degdiv = DegreeDivergence()
            # The graphs must be passed through; calling ``dist()`` without
            # arguments fails before any comparison is made.
            dist = degdiv.dist(G1, G2)

            # the 2k-distance stores the distribution in a sparse matrix,
            # so here we take the output of DegreeDivergence and
            # produce a comparable object
            hist1, hist2 = degdiv.results['degree_histograms']

            # Divide out of place: this leaves the arrays held by ``degdiv``
            # untouched and also works when the histograms hold integers.
            hist1 = coo_matrix(np.asarray(hist1) / len(G1))
            hist2 = coo_matrix(np.asarray(hist2) / len(G2))

            self.results["dk_distributions"] = hist1, hist2

        elif d == 2:
            D1 = dk2_series(G1, N)
            D2 = dk2_series(G2, N)

            # store the 2K-distributions
            self.results["dk_distributions"] = D1, D2

            # Only the upper triangle is populated, so it carries the whole
            # distribution. Indexing with ``triu_indices`` already yields a
            # 1-D vector, and both vectors have the same length because the
            # matrices were padded to the same size.
            upper = np.triu_indices(N)
            G1_dk_normed = D1.toarray()[upper]
            G2_dk_normed = D2.toarray()[upper]

            assert np.isclose(G1_dk_normed.sum(), 1)
            assert np.isclose(G2_dk_normed.sum(), 1)

            dist = entropy.js_divergence(G1_dk_normed, G2_dk_normed)
        else:
            raise NotImplementedError(
                "dkSeries only supports d=1 and d=2, got d={!r}".format(d)
            )

        self.results["dist"] = dist
        return dist


def dk2_series(G, N=None):
    """
    Build the normalized 2k-series of ``G``.

    Every edge is labelled with the degrees of its two endpoints, and the
    edges sharing a label are counted. The counts are placed in the upper
    triangle of a sparse matrix at position ``(low degree, high degree)``
    and divided by the number of edges in ``G``.

    Parameters
    ----------
    G (nx.Graph)
        the graph to summarize

    N (int or None)
        side length of the returned square matrix; defaults to ``len(G)``

    Returns
    -------
    D (sparse matrix)
        ``N`` x ``N`` matrix of edge fractions per degree pair.

    """

    side = len(G) if N is None else N
    degree_of = dict(nx.degree(G))

    pair_counts = defaultdict(int)
    for u, v in G.edges:
        deg_u, deg_v = degree_of[u], degree_of[v]
        # Key on (smaller degree, larger degree) so that every count ends up
        # on or above the diagonal of the final matrix.
        pair_counts[(min(deg_u, deg_v), max(deg_u, deg_v))] += 1

    n_edges = G.size()

    # sanity check: each edge contributes to exactly one degree pair
    assert sum(list(pair_counts.values())) == n_edges

    # unpack the dictionary into the (data, (row, col)) triplet format
    rows = []
    cols = []
    counts = []
    for (low, high), count in pair_counts.items():
        rows.append(low)
        cols.append(high)
        counts.append(count)

    counts_matrix = coo_matrix((counts, (rows, cols)), shape=(side, side))

    # turn raw counts into fractions of the total number of edges
    return counts_matrix / n_edges

0 comments on commit 4d9c927

Please sign in to comment.