From fad9671a7385149790696a1b3a3891b968e1d47e Mon Sep 17 00:00:00 2001
From: "elahe.kooshafar"
Date: Tue, 5 Dec 2023 22:03:40 -0500
Subject: [PATCH 1/7] new features

---
 tgx/utils/newstat.py        | 163 ++++++++++++++++++++++++++++++++++++
 tgx/utils/plotting_utils.py |   8 +-
 2 files changed, 167 insertions(+), 4 deletions(-)
 create mode 100644 tgx/utils/newstat.py

diff --git a/tgx/utils/newstat.py b/tgx/utils/newstat.py
new file mode 100644
index 0000000..6144c92
--- /dev/null
+++ b/tgx/utils/newstat.py
@@ -0,0 +1,163 @@
+from tgx.utils.plotting_utils import plot_for_snapshots, plot_nodes_edges_per_ts, plot_density_map
+import networkx as nx
+import numpy as np
+from tgx.utils.graph_utils import train_test_split
+from typing import List, Dict
+
+__all__ = ["connected_components_per_ts",
+           "size_connected_components",
+           "get_avg_node_engagement",
+           "degree_density"]
+
+
+def degree_density(graph: object, k: int = 10, network_name: str = None, plot_path: str = None) -> None:
+    r"""
+    Plot density map of node degrees per time window
+    Parameters:
+    graph: Graph object created by tgx.Graph containing the edgelist
+    k: number of time windows
+    network_name: name of the graph to be used in the output file name
+    plot_path: path to save the output figure
+    """
+    graph_edgelist = graph.data
+    degrees_by_k_list = []
+    temp = []
+    temp_idx = 0
+    unique_ts = list(graph_edgelist.keys())
+
+    for ts in unique_ts:
+        e_at_this_ts = graph_edgelist[ts]
+        G = nx.MultiGraph()
+
+        for e in e_at_this_ts:
+            G.add_edge(e[0], e[1])
+
+        nodes = G.nodes()
+        degrees = [G.degree[n] for n in nodes]
+
+        if temp_idx < k:
+            temp = temp + degrees
+            temp_idx += 1
+        else:
+            degrees_by_k_list.append(temp)
+            temp = degrees
+            temp_idx = 1
+
+    if temp:
+        degrees_by_k_list.append(temp)
+
+    if network_name is not None:
+        filename = f"{network_name}_degree_density"
+    else:
+        filename = "_degree_density"
+
+    plot_density_map(degrees_by_k_list, filename, "Node Degree", plot_path = plot_path)
+    return
+
+
+def _find(x, parent):
+    if parent[x] == x:
+        return x
+    parent[x] = _find(parent[x], parent)
+    return parent[x]
+
+
+def _merge(x, y, parent):
+    root_x = _find(x, parent)
+    root_y = _find(y, parent)
+
+    if root_x != root_y:
+        parent[root_x] = root_y
+
+
+def connected_components_per_ts(graph: object,
+                                network_name: str = None,
+                                plot_path: str = None) -> None:
+    r"""
+    Plot number of connected components per timestamp
+    Parameters:
+    graph: Graph object created by tgx.Graph containing the edgelist
+    network_name: name of the graph to be used in the output file name
+    plot_path: path to save the output figure
+    """
+    num_components = []
+    for t in range(len(graph.data)):
+        edgelist_t = graph.data[t]
+        nodes_t = graph.edgelist_node_list(edgelist_t)
+        parent = {node: node for node in nodes_t}
+
+        for edge in edgelist_t:
+            (u, v) = edge
+            _merge(u, v, parent)
+
+        num = 0
+        for u in nodes_t:
+            if parent[u] == u:
+                num += 1
+        num_components.append(num)
+
+    if network_name is not None:
+        filename = f"{network_name}_connected_components_per_ts"
+    else:
+        filename = "_connected_components_per_ts"
+
+    plot_for_snapshots(num_components, filename, "Number of connected components", plot_path = plot_path)
+    return
+
+
+def size_connected_components(graph: object) -> List[List]:
+    r"""
+    Calculate the sizes of connected components per timestamp
+    Returns:
+    list[list]: A list containing lists of sizes of connected components for each timestamp.
+    """
+    component_sizes = []
+    for t in range(len(graph.data)):
+        edgelist_t = graph.data[t]
+        nodes_t = graph.edgelist_node_list(edgelist_t)
+        parent = {node: node for node in nodes_t}
+
+        for edge in edgelist_t:
+            (u, v) = edge
+            _merge(u, v, parent)
+
+        component_sizes_t = {}
+        for u in nodes_t:
+            root = _find(u, parent)
+            if root not in component_sizes_t:
+                component_sizes_t[root] = 0
+            component_sizes_t[root] += 1
+
+        component_sizes_t_list = list(component_sizes_t.values())
+        component_sizes.append(component_sizes_t_list)
+
+    return component_sizes
+
+
+def get_avg_node_engagement(graph: object) -> List[int]:
+    r"""
+    Calculate the average node engagement per timestamp,
+    the average number of distinct nodes that establish
+    at least one new connection.
+    Parameters:
+    graph: Graph object created by tgx.Graph containing the edgelist
+    """
+    engaging_nodes = []
+    previous_edges = set()
+
+    for ts in range(len(graph.data)):
+        edgelist_t = graph.data[ts]
+        new_nodes = set()
+
+        for edge in edgelist_t:
+            (u, v) = edge
+            if frozenset({u, v}) not in previous_edges:
+                if u not in new_nodes:
+                    new_nodes.add(u)
+                if v not in new_nodes:
+                    new_nodes.add(v)
+
+        engaging_nodes.append(len(new_nodes))
+        previous_edges = {frozenset({u, v}) for (u, v) in edgelist_t}      # Update the set of previous edges for next timestamp
+
+    return engaging_nodes
\ No newline at end of file
diff --git a/tgx/utils/plotting_utils.py b/tgx/utils/plotting_utils.py
index 377e7ad..19a7d7a 100644
--- a/tgx/utils/plotting_utils.py
+++ b/tgx/utils/plotting_utils.py
@@ -104,10 +104,10 @@ def plot_density_map(data, filename, y_title, plot_path = None):
     Plot a density map using fig and ax
     '''
     # Create a 2D list for color values
-    c = np.zeros((np.max(data), len(data)))
+    max_value = np.max([np.max(inner) for inner in data if inner])
+    c = np.zeros((max_value, len(data)))
     for i, row in enumerate(data):
         for value in row:
-            # print(value)
             c[value-1][i] += 1
 
     # Plot
@@ -115,10 +115,10 @@ def plot_density_map(data, filename, y_title, plot_path = None):
     ax = fig.add_subplot(111)
 
     norm = mcolors.Normalize(vmin=0, vmax=1)
-    cax = ax.imshow(c, cmap='viridis', interpolation='nearest', norm=norm)
+    cax = ax.imshow(c, cmap='viridis', interpolation='nearest', norm=norm, aspect='auto')
    cbar = fig.colorbar(cax)
 
-    ax.set_title("Heatmap of Node Degrees Over Time")
+    ax.set_title(f"Heatmap of {y_title} Over Time")
     ax.set_xlabel('Time', fontsize=20)
     ax.set_ylabel(y_title, fontsize=20)
     ax.tick_params(labelsize=20)
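
A note on the union-find logic above: _find compresses paths while walking up to the root, and _merge links one root under the other, so after all edges of a snapshot have been merged the roots (nodes with parent[u] == u) correspond one-to-one with connected components. The following self-contained Python sketch is a sanity check of that logic; the toy edgelist and the expected output are illustrative assumptions, not part of the patch.

    def _find(x, parent):
        if parent[x] == x:
            return x
        parent[x] = _find(parent[x], parent)
        return parent[x]

    def _merge(x, y, parent):
        # Link the root of x under the root of y.
        root_x = _find(x, parent)
        root_y = _find(y, parent)
        if root_x != root_y:
            parent[root_x] = root_y

    # One toy snapshot with two components: {1, 2, 3} and {4, 5}.
    edges_t = [(1, 2), (2, 3), (4, 5)]
    nodes_t = {u for e in edges_t for u in e}
    parent = {n: n for n in nodes_t}
    for u, v in edges_t:
        _merge(u, v, parent)

    # Tally members per root, as size_connected_components does per timestamp.
    sizes = {}
    for u in nodes_t:
        root = _find(u, parent)
        sizes[root] = sizes.get(root, 0) + 1
    print(sorted(sizes.values()))  # expected: [2, 3]
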
From 8d5b3af94b6dfd7d64e4f9ab6ea857fadec9538e Mon Sep 17 00:00:00 2001
From: "elahe.kooshafar"
Date: Tue, 5 Dec 2023 22:11:17 -0500
Subject: [PATCH 2/7] example for new features

---
 examples/newtest.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 examples/newtest.py

diff --git a/examples/newtest.py b/examples/newtest.py
new file mode 100644
index 0000000..d4d8b93
--- /dev/null
+++ b/examples/newtest.py
@@ -0,0 +1,27 @@
+import tgx
+import tgx.utils.newstat as newstat
+from tgx.utils.plotting_utils import plot_for_snapshots
+
+
+plot_path = "/home/mila/e/elahe.kooshafar/projects/TGX_results"
+
+dataset = tgx.builtin.uci()
+G = tgx.Graph(dataset)
+new_G = G.discretize(time_scale="weekly")
+
+# Number of Connected Components
+newstat.connected_components_per_ts(new_G, network_name=dataset.name, plot_path = plot_path)
+
+# Size of Largest Connected Component
+component_sizes = newstat.size_connected_components(new_G)
+largest_component_sizes = [max(inner_list) if inner_list else 0 for inner_list in component_sizes]
+filename = f"{dataset.name}_largest_connected_component_size"
+plot_for_snapshots(largest_component_sizes, filename, "Size of Largest Connected Component", plot_path = plot_path)
+
+# Average Node Engagement
+engagements = newstat.get_avg_node_engagement(new_G)
+filename = f"{dataset.name}_average_node_engagement"
+plot_for_snapshots(engagements, filename, "Average Engagement", plot_path = plot_path)
+
+# Degree Density
+newstat.degree_density(new_G, k=3, network_name=dataset.name, plot_path = plot_path)
\ No newline at end of file
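
One API change worth flagging before the next patch: it makes Graph.discretize always return a tuple (see the "return (new_G, )" hunk in tgx/classes/graph.py below), which is why the example above gains a [0] index in the newtest.py diff. A short sketch of the two calling patterns, assuming the tgx API as modified by this series:

    import tgx

    dataset = tgx.builtin.uci()
    ctdg = tgx.Graph(dataset)

    # store_unix defaults to False: the result is the 1-tuple (new_G,).
    dtdg = ctdg.discretize(time_scale="weekly")[0]

    # With store_unix=True, a second element carries the unix timestamps.
    dtdg, ts_list = ctdg.discretize(time_scale="monthly", store_unix=True)
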
From 32be52474ae9c12c54e1c17031ba33fdc33cbcf9 Mon Sep 17 00:00:00 2001
From: Elahe-ek
Date: Wed, 6 Dec 2023 23:52:05 -0500
Subject: [PATCH 3/7] update new features

---
 README.md                                    |   4 +-
 examples/data_viz.py                         |  59 ++++
 examples/discretize_ctdg.py                  |  29 +-
 examples/newtest.py                          |   2 +-
 examples/test_paths.py                       |  12 +-
 tgx/classes/__pycache__/graph.cpython-39.pyc | Bin 5111 -> 5111 bytes
 tgx/classes/graph.py                         |   6 +-
 tgx/utils/graph_utils.py                     |   9 +-
 tgx/utils/newstat.py                         |   8 +-
 tgx/utils/plotting_utils.py                  |   8 +-
 tgx/utils/stat.py                            | 271 ++++++++++++-------
 tgx/viz/TEA.py                               |   4 +-
 tgx/viz/TET.py                               |   2 +-
 13 files changed, 278 insertions(+), 136 deletions(-)
 create mode 100644 examples/data_viz.py

diff --git a/README.md b/README.md
index d55df9b..6a92f02 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
+# Temporal Graph Analysis with TGX (to appear in WSDM 2024)
+
 ### Install dependency
 
 Our implementation works with python >= 3.9 and can be installed as follows
@@ -42,4 +44,4 @@ first create the branch on github
 
 git fetch origin
 git checkout -b test origin/test
-```
\ No newline at end of file
+```
diff --git a/examples/data_viz.py b/examples/data_viz.py
new file mode 100644
index 0000000..3984ad6
--- /dev/null
+++ b/examples/data_viz.py
@@ -0,0 +1,59 @@
+import tgx
+from tgx.utils.graph_utils import list2csv
+
+"""
+1. load a dataset
+2. load into a graph
+3. discretize the graph
+4. save the graph back to a csv
+"""
+
+#! load the datasets
+# dataset = tgx.builtin.uci() #built in datasets
+
+data_name = "tgbl-wiki" #"tgbl-review"
+dataset = tgx.tgb_data(data_name) #tgb datasets
+
+
+ctdg = tgx.Graph(dataset)
+time_scale = "daily"
+dtdg = ctdg.discretize(time_scale=time_scale)
+
+
+#! plotting the statistics, works
+tgx.degree_over_time(dtdg, network_name=data_name)
+tgx.nodes_over_time(dtdg, network_name=data_name)
+tgx.edges_over_time(dtdg, network_name=data_name)
+tgx.nodes_and_edges_over_time(dtdg, network_name=data_name)
+
+tgx.TET(dtdg,
+        network_name=data_name,
+        figsize = (9, 5),
+        axis_title_font_size = 24,
+        ticks_font_size = 24)
+
+
+tgx.TEA(dtdg,
+        network_name=data_name)
+
+
+
+#! compute statistics
+test_ratio = 0.15
+tgx.get_reoccurrence(ctdg, test_ratio=test_ratio)
+tgx.get_surprise(ctdg, test_ratio=test_ratio)
+
+#* these two much faster on dtdgs
+tgx.get_avg_node_activity(dtdg)
+tgx.get_novelty(dtdg)
+
+
+
+
+
+# #! statistics to be updated and fixed
+# #TODO
+# tgx.degree_density()
+# tgx.connected_components_per_ts()
+# tgx.size_connected_components()
+# tgx.get_avg_node_engagement()
\ No newline at end of file
diff --git a/examples/discretize_ctdg.py b/examples/discretize_ctdg.py
index 630d326..802b03b 100644
--- a/examples/discretize_ctdg.py
+++ b/examples/discretize_ctdg.py
@@ -1,4 +1,5 @@
 import tgx
+from tgx.utils.graph_utils import list2csv
 
 """
 1. load a dataset
@@ -8,8 +9,10 @@
 """
 
 #!
load the datasets -dataset = tgx.builtin.uci() -# dataset = tgx.tgb_data("tgbl-wiki") +# dataset = tgx.builtin.uci() + +data_name = "tgbl-wiki" #"tgbl-review" +dataset = tgx.tgb_data(data_name) # dataset = tgx.tgb_data("tgbl-review") # dataset = tgx.tgb_data("tgbl-coin") @@ -17,27 +20,11 @@ ctdg = tgx.Graph(dataset) # ctdg.save2csv("ctdg") -dtdg = ctdg.discretize(time_scale="hourly") -# dtdg.save2csv("dtdg_daily") -print ("discretize to hourly") -print ("there is time gap, ", dtdg.check_time_gap()) - - -dtdg = ctdg.discretize(time_scale="daily") -# dtdg.save2csv("dtdg_daily") -print ("discretize to daily") -print ("there is time gap, ", dtdg.check_time_gap()) - -dtdg = ctdg.discretize(time_scale="weekly") -# dtdg.save2csv("dtdg_weekly") -print ("discretize to weekly") -print ("there is time gap, ", dtdg.check_time_gap()) - -time_scale = "monthly" -dtdg = ctdg.discretize(time_scale=time_scale) +time_scale = "hourly" #"monthly" #"weekly" #"daily" #"hourly" #"minutely" +dtdg, ts_list = ctdg.discretize(time_scale=time_scale, store_unix=True) print ("discretize to ", time_scale) print ("there is time gap, ", dtdg.check_time_gap()) - +list2csv(ts_list, data_name + "_ts" + "_" + time_scale + ".csv") diff --git a/examples/newtest.py b/examples/newtest.py index d4d8b93..cc7aac5 100644 --- a/examples/newtest.py +++ b/examples/newtest.py @@ -7,7 +7,7 @@ dataset = tgx.builtin.uci() G = tgx.Graph(dataset) -new_G = G.discretize(time_scale="weekly") +new_G = G.discretize(time_scale="weekly")[0] # Number of Connected Components newstat.connected_components_per_ts(new_G, network_name=dataset.name, plot_path = plot_path) diff --git a/examples/test_paths.py b/examples/test_paths.py index acc20b7..1c88d8d 100644 --- a/examples/test_paths.py +++ b/examples/test_paths.py @@ -48,12 +48,12 @@ -# Load data into dictionary -G = read_csv(fname) -G.discretize(options) +# # Load data into dictionary +# G = read_csv(fname) +# G.discretize(options) -TEA(G) #create the temp dictionary here by going through the edges -TET(G) #check if the temp dict is created, if not, create it -nodes_over_time(G) +# TEA(G) #create the temp dictionary here by going through the edges +# TET(G) #check if the temp dict is created, if not, create it +# nodes_over_time(G) diff --git a/tgx/classes/__pycache__/graph.cpython-39.pyc b/tgx/classes/__pycache__/graph.cpython-39.pyc index 447eeec7180ddcdb89824d636f09b657d36486a8..5632a8e615cce273c7aece09bb28cd7f3c501dc4 100644 GIT binary patch delta 100 zcmV-q0Gt2!C-)}}VGRum00000H^Ojb#IX(O1py|L_XUyxI+Kb9-2pzcXa^Gk0YT0 GF%!mjY$6Q+ delta 100 zcmV-q0Gt2!C-)}}VGRum00000)JA1xeX$Mc1py?J_XUyxIFpJ7-2ptaXa^Gk0YtNW z2ulG0PqWAg3;_XLlhq3O0c?}c3)BH~v!o1I0Rerp{SCMQ0gSV@4}t;#nv*#a_W>M} GF%!mQl_A6c diff --git a/tgx/classes/graph.py b/tgx/classes/graph.py index a42296d..0d4ba15 100644 --- a/tgx/classes/graph.py +++ b/tgx/classes/graph.py @@ -5,7 +5,7 @@ import copy import csv - +#TODO should contain a new property tracking the number of timestamps class Graph(object): def __init__(self, dataset: Optional[object] = None, @@ -31,6 +31,8 @@ def __init__(self, self.subsampled_graph = None self.freq_data = None + + def discretize(self, @@ -51,7 +53,7 @@ def discretize(self, if (store_unix): return new_G, output[1] else: - return new_G + return (new_G, ) def count_freq(self): self.freq_data = frequency_count(self.data) diff --git a/tgx/utils/graph_utils.py b/tgx/utils/graph_utils.py index 5eb90a8..99bd4d0 100644 --- a/tgx/utils/graph_utils.py +++ b/tgx/utils/graph_utils.py @@ -272,4 +272,11 @@ def is_discretized(edgelist: 
Optional[dict], if len(timestamps) > max_timestamps: discretized = False - return discretized \ No newline at end of file + return discretized + +def list2csv(lst: list, + fname: str, + delimiter: str = ",", + fmt: str = '%i'): + out_list = np.array(lst) + np.savetxt(fname, out_list, delimiter=delimiter, fmt=fmt) \ No newline at end of file diff --git a/tgx/utils/newstat.py b/tgx/utils/newstat.py index 6144c92..4ce58fd 100644 --- a/tgx/utils/newstat.py +++ b/tgx/utils/newstat.py @@ -10,7 +10,7 @@ "degree_density"] -def degree_density(graph: object, k: int = 10, network_name: str = None, plot_path: str = None) -> None: +def degree_density(graph: tuple, k: int = 10, network_name: str = None, plot_path: str = None) -> None: r""" Plot density map of node degrees per time window Parameters: @@ -70,7 +70,7 @@ def _merge(x, y, parent): parent[root_x] = root_y -def connected_components_per_ts(graph: object, +def connected_components_per_ts(graph: tuple, network_name: str = None, plot_path: str = None) -> None: r""" @@ -105,7 +105,7 @@ def connected_components_per_ts(graph: object, return -def size_connected_components(graph: object) -> List[List]: +def size_connected_components(graph: tuple) -> List[List]: r""" Calculate the sizes of connected components per timestamp Returns: @@ -134,7 +134,7 @@ def size_connected_components(graph: object) -> List[List]: return component_sizes -def get_avg_node_engagement(graph: object) -> List[int]: +def get_avg_node_engagement(graph: tuple) -> List[int]: r""" Calculate the average node engagement per timestamp, the average number of distinct nodes that establish diff --git a/tgx/utils/plotting_utils.py b/tgx/utils/plotting_utils.py index 19a7d7a..377e7ad 100644 --- a/tgx/utils/plotting_utils.py +++ b/tgx/utils/plotting_utils.py @@ -104,10 +104,10 @@ def plot_density_map(data, filename, y_title, plot_path = None): Plot a density map using fig and ax ''' # Create a 2D list for color values - max_value = np.max([np.max(inner) for inner in data if inner]) - c = np.zeros((max_value, len(data))) + c = np.zeros((np.max(data), len(data))) for i, row in enumerate(data): for value in row: + # print(value) c[value-1][i] += 1 # Plot @@ -115,10 +115,10 @@ def plot_density_map(data, filename, y_title, plot_path = None): ax = fig.add_subplot(111) norm = mcolors.Normalize(vmin=0, vmax=1) - cax = ax.imshow(c, cmap='viridis', interpolation='nearest', norm=norm, aspect='auto') + cax = ax.imshow(c, cmap='viridis', interpolation='nearest', norm=norm) cbar = fig.colorbar(cax) - ax.set_title(f"Heatmap of {y_title} Over Time") + ax.set_title("Heatmap of Node Degrees Over Time") ax.set_xlabel('Time', fontsize=20) ax.set_ylabel(y_title, fontsize=20) ax.tick_params(labelsize=20) diff --git a/tgx/utils/stat.py b/tgx/utils/stat.py index fbaf2a7..864427d 100644 --- a/tgx/utils/stat.py +++ b/tgx/utils/stat.py @@ -2,6 +2,7 @@ import networkx as nx import numpy as np from tgx.utils.graph_utils import train_test_split +from typing import List, Dict __all__ = ["degree_over_time", "nodes_over_time", @@ -16,7 +17,10 @@ "get_novelty", "get_avg_node_activity", "get_avg_node_engagement", - "degree_density"] + "degree_density", + "connected_components_per_ts", + "size_connected_components", + "get_avg_node_engagement"] def degree_over_time(graph: object, @@ -38,8 +42,6 @@ def degree_over_time(graph: object, else: filename = "ave_degree_per_ts" plot_for_snapshots(ave_degree, filename, "Average degree", plot_path = filepath) - - print("Plotting Done!") return @@ -62,7 +64,6 @@ def nodes_over_time(graph: 
object, else: filename = "nodes_per_ts" plot_for_snapshots(active_nodes, filename, "Number of nodes", plot_path = filepath) - # print("Plotting Done!") return def edges_over_time(graph: object, @@ -83,7 +84,6 @@ def edges_over_time(graph: object, else: filename = "_edges_per_ts" plot_for_snapshots(active_edges, plot_path, filename, "Number of edges", plot_path = filepath) - # print("Plotting Done!") return def nodes_and_edges_over_time(graph: object, @@ -231,69 +231,6 @@ def merge(x, y, parent): if root_x != root_y: parent[root_x] = root_y - -# def size_connected_components(graph) -> list: -# """ -# Calculate the sizes of connected components per timestamp. - -# Returns: -# component_sizes: A list containing the sizes of connected components in each timestamp. -# """ - -# component_sizes = [] -# for t in range(len(graph)): -# parent = list(range(graph[t].number_of_nodes)) - -# for _, edge_data in graph[t].edgelist.items(): -# for (u, v), _ in edge_data.items(): -# merge(u, v, parent) - -# component_sizes_t = {} -# for u in graph[t].nodes(): -# root = find(u, parent) -# if root not in component_sizes_t: -# component_sizes_t[root] = 0 -# component_sizes_t[root] += 1 - -# component_sizes.append(component_sizes_t) - -# return component_sizes - - -# def num_connected_components_per_ts(graph: list, -# network_name: str = None, -# plot_path: str = None) -> None: -# """ - -# Plot the number of connected components per timestamp. - -# """ - -# num_components = [] -# for t in range(len(graph)): -# parent = list(range(graph[t].number_of_nodes)) - -# for _, edge_data in graph[t].edgelist.items(): -# for (u, v), _ in edge_data.items(): -# merge(u, v, parent) - -# num = 0 -# for u in graph[t].nodes(): -# if parent[u] == u: -# num += 1 -# num_components.append(num) - -# if network_name is not None: -# filename = f"{network_name}_num_connected_components_per_ts" -# else: -# filename = "_num_connected_components_per_ts" -# plot_for_snapshots(num_components, filename, "Number of connected components", plot_path = plot_path) -# print(num_components) -# print("Plotting Done!") - -# return - - def get_reoccurrence(graph:object, test_ratio: float=0.15) -> float: r""" Calculate the recurrence index @@ -406,34 +343,19 @@ def get_avg_node_activity(graph: object) -> float: return avg_node_activity -def get_avg_node_engagement(graph: object): +#* new graph stats added +#TODO to not require k as input but get it from the Graph object +def degree_density(graph : object, k: int = 10, network_name: str = None, plot_path: str = None) -> None: r""" - get the average node engagement over time. - node engagement represents the average number of distinct nodes that establish - at least one new connection during each time step. 
+ Plot density map of node degrees per time window + Parameters: + graph: Graph object created by tgx.Graph containing edgelist + k: number of time windows + network_name: name of the graph to be used in the output file name + plot_path: path to save the output figure """ graph_edgelist = graph.data - engaging_nodes = [] - previous_edges = set() - for ts, e_list in graph_edgelist.items(): - node_set = set() - new_edges = {(u, v) for (u, v) in e_list if frozenset({u, v}) not in previous_edges} - for u, v in new_edges: - if u not in node_set: - node_set.add(u) - if v not in node_set: - node_set.add(v) - # engaging_nodes.append((ts, len(node_set))) - engaging_nodes.append(len(node_set)) - previous_edges = {frozenset({u, v}) for (u, v) in e_list} # Update the set of previous edges for the next timestamp - return engaging_nodes - -def degree_density(graph: object, network_name: str = None, k = 10, plot_path: str = None) -> None: - r""" - plot the density map of node degrees over timestamps - """ - graph_edgelist = graph.data degrees_by_k_list = [] temp = [] temp_idx = 0 @@ -462,4 +384,167 @@ def degree_density(graph: object, network_name: str = None, k = 10, plot_path: s filename = "_get_degree_density" plot_density_map(degrees_by_k_list, filename, "Node Degree", plot_path = plot_path) print("Plotting Done!") - return \ No newline at end of file + return + +def _find(x, parent): + if parent[x] == x: + return x + parent[x] = _find(parent[x], parent) + return parent[x] + + +def _merge(x, y, parent): + root_x = _find(x, parent) + root_y = _find(y, parent) + + if root_x != root_y: + parent[root_x] = root_y + + +#TODO to be fixed +def connected_components_per_ts(graph: list, + network_name: str = None, + plot_path: str = None) -> None: + r""" + Plot number of connected components per timestamp + Parameters: + graph: a list containing graph snapshots + network_name: name of the graph to be used in the output file name + plot_path: path to save the output figure + """ + num_components = [] + for t in range(len(graph)): + parent = list(range(graph[t].number_of_nodes)) + + for _, edge_data in graph[t].edgelist.items(): + for (u, v), _ in edge_data.items(): + _merge(u, v, parent) + + num = 0 + for u in graph[t].nodes(): + if parent[u] == u: + num += 1 + num_components.append(num) + + if network_name is not None: + filename = f"{network_name}_connected_components_per_ts" + else: + filename = "_connected_components_per_ts" + plot_for_snapshots(num_components, filename, "Number of connected components", plot_path = plot_path) + print(num_components) + print("Plotting Done!") + + return + +#TODO to be fixed +def size_connected_components(graph: list) -> List[Dict]: + r""" + Calculate the sizes of connected components per timestamp + Returns: + list: A list containing the sizes of connected components in each timestamp. + """ + component_sizes = [] + for t in range(len(graph)): + parent = list(range(graph[t].number_of_nodes)) + + for _, edge_data in graph[t].edgelist.items(): + for (u, v), _ in edge_data.items(): + _merge(u, v, parent) + + component_sizes_t = {} + for u in graph[t].nodes(): + root = _find(u, parent) + if root not in component_sizes_t: + component_sizes_t[root] = 0 + component_sizes_t[root] += 1 + + component_sizes.append(component_sizes_t) + + return component_sizes + + +def get_avg_node_engagement(graph_edgelist: dict) -> List[int]: + r""" + Calculate the average node engagement per timestamp, + the average number of distinct nodes that establish + at least one new connection. 
+ Parameters: + graph_edgelist: Dictionary containing graph data + """ + engaging_nodes = [] + previous_edges = set() + for ts, e_list in graph_edgelist.items(): + node_set = set() + new_edges = {(u, v) for (u, v), _ in e_list.items() if frozenset({u, v}) not in previous_edges} + for u, v in new_edges: + if u not in node_set: + node_set.add(u) + if v not in node_set: + node_set.add(v) + engaging_nodes.append(len(node_set)) + previous_edges = {frozenset({u, v}) for (u, v), _ in e_list.items()} # Update the set of previous edges for the next timestamp + return engaging_nodes + + + + +# def size_connected_components(graph) -> list: +# """ +# Calculate the sizes of connected components per timestamp. + +# Returns: +# component_sizes: A list containing the sizes of connected components in each timestamp. +# """ + +# component_sizes = [] +# for t in range(len(graph)): +# parent = list(range(graph[t].number_of_nodes)) + +# for _, edge_data in graph[t].edgelist.items(): +# for (u, v), _ in edge_data.items(): +# merge(u, v, parent) + +# component_sizes_t = {} +# for u in graph[t].nodes(): +# root = find(u, parent) +# if root not in component_sizes_t: +# component_sizes_t[root] = 0 +# component_sizes_t[root] += 1 + +# component_sizes.append(component_sizes_t) + +# return component_sizes + + +# def num_connected_components_per_ts(graph: list, +# network_name: str = None, +# plot_path: str = None) -> None: +# """ + +# Plot the number of connected components per timestamp. + +# """ + +# num_components = [] +# for t in range(len(graph)): +# parent = list(range(graph[t].number_of_nodes)) + +# for _, edge_data in graph[t].edgelist.items(): +# for (u, v), _ in edge_data.items(): +# merge(u, v, parent) + +# num = 0 +# for u in graph[t].nodes(): +# if parent[u] == u: +# num += 1 +# num_components.append(num) + +# if network_name is not None: +# filename = f"{network_name}_num_connected_components_per_ts" +# else: +# filename = "_num_connected_components_per_ts" +# plot_for_snapshots(num_components, filename, "Number of connected components", plot_path = plot_path) +# print(num_components) +# print("Plotting Done!") + +# return diff --git a/tgx/viz/TEA.py b/tgx/viz/TEA.py index bfa0bae..e684f4d 100644 --- a/tgx/viz/TEA.py +++ b/tgx/viz/TEA.py @@ -211,8 +211,8 @@ def TEA_plot_edges_bar(ts_edges_dist: list, plt.ylabel("Number of edges", fontsize=font_size) plt.legend(fontsize = 13) if filepath is not None: - plt.savefig(f"{filepath}/{network_name}.pdf") - print("plot saved as " + f"{filepath}/{network_name}.pdf") + plt.savefig(f"{filepath}/{network_name}_TEA.pdf") + print("plot saved as " + f"{filepath}/{network_name}_TEA.pdf") if (show): plt.show() diff --git a/tgx/viz/TET.py b/tgx/viz/TET.py index 090a97f..52bb2a9 100644 --- a/tgx/viz/TET.py +++ b/tgx/viz/TET.py @@ -294,7 +294,7 @@ def plot_edge_presence_matrix(e_presence_mat, if fig_param.fig_name is not None: # print("Info: file name: {}".format(fig_param.fig_name)) - plt.savefig(f"{fig_param.fig_name}/{fig_param.network_name}.pdf") + plt.savefig(f"{fig_param.fig_name}/{fig_param.network_name}_TET.pdf") plt.show() print("Info: plotting done!") From 934172ec5220f4b91ca7fb192bdae3b763a9aa5d Mon Sep 17 00:00:00 2001 From: "elahe.kooshafar" Date: Thu, 7 Dec 2023 12:26:34 -0500 Subject: [PATCH 4/7] update --- tgx/classes/__pycache__/graph.cpython-39.pyc | Bin 5111 -> 5364 bytes tgx/classes/graph.py | 10 ++++++++++ tgx/utils/plotting_utils.py | 15 ++++++++++----- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git 
a/tgx/classes/__pycache__/graph.cpython-39.pyc b/tgx/classes/__pycache__/graph.cpython-39.pyc index 5632a8e615cce273c7aece09bb28cd7f3c501dc4..8c02d4ee6aeba2e9aadcd83583365d0ecf2e3291 100644 GIT binary patch delta 446 zcmZXOJ4*vm5QTR(yGEj$=*Gl^_%=qEj|q;W2MGv}T&Fdx%z>7c1;0g3AD{gC}6E`l$aH&RS0 z07?ZJrh*@u*Mq%~G7txma+qDn9jbSQ10wS+N=3 KhGlUdp8o|os&an- delta 271 zcmeyO`CXkak(ZZ?0SL?w6{H@Kn#lKD?-`Jn$`Hkv!VtxjqMXW{$|A`CBw4{EngQnW%_lzn`JSzIRc_sR3`pNkzsgo^PdKn!iA7`=M%*I;JE~?4oS0oK| zi6&E#CXg!91FnnNp7$o(EtZ_j;*!a0`CJ+6H-F~KU}UtM93$Y**tdC&Kob+= zoXPsaR*d44vxFmn5{HGAxUE2D*nkMv$p=KlC-aDKF*;2a7qOWfDv|(XZ4 list: r""" diff --git a/tgx/utils/plotting_utils.py b/tgx/utils/plotting_utils.py index 377e7ad..57318be 100644 --- a/tgx/utils/plotting_utils.py +++ b/tgx/utils/plotting_utils.py @@ -99,16 +99,16 @@ def plot_for_snapshots(data: list, plt.show() -def plot_density_map(data, filename, y_title, plot_path = None): +def plot_density_map(data, filename, y_title, plot_path=None): ''' Plot a density map using fig and ax ''' - # Create a 2D list for color values - c = np.zeros((np.max(data), len(data))) + max_value = max(max(inner) for inner in data if inner) + c = np.zeros((max_value, len(data))) + for i, row in enumerate(data): for value in row: - # print(value) - c[value-1][i] += 1 + c[value - 1][i] += 1 # Plot fig = plt.figure(facecolor='w', figsize=(9, 6)) @@ -117,12 +117,17 @@ def plot_density_map(data, filename, y_title, plot_path = None): norm = mcolors.Normalize(vmin=0, vmax=1) cax = ax.imshow(c, cmap='viridis', interpolation='nearest', norm=norm) cbar = fig.colorbar(cax) + cbar.set_label('Frequency') ax.set_title("Heatmap of Node Degrees Over Time") ax.set_xlabel('Time', fontsize=20) ax.set_ylabel(y_title, fontsize=20) ax.tick_params(labelsize=20) ax.xaxis.set_major_locator(MaxNLocator(integer=True)) + + # Adjust the aspect ratio of the plot + ax.set_aspect('auto') + if plot_path is not None: plt.savefig(f'{plot_path}/{filename}') plt.show() From 28e46a3cc33d30362109cb43ce57cf4ea1919d32 Mon Sep 17 00:00:00 2001 From: shenyangHuang Date: Sun, 21 Jan 2024 11:56:42 -0500 Subject: [PATCH 5/7] commit changes avoid to lost --- .gitignore | 1 + docs/tutorials/data_loader.ipynb | 2 +- examples/data_viz.py | 116 +- tgx/__init__.py | 28 +- .../.ipynb_checkpoints/graph-checkpoint.py | 168 +-- tgx/classes/__pycache__/graph.cpython-39.pyc | Bin 5364 -> 5338 bytes tgx/classes/graph.py | 424 +++---- tgx/data/builtin.py | 400 +++--- tgx/data/tgb.py | 204 +-- tgx/io/read.py | 484 ++++---- tgx/io/write.py | 82 +- .../.ipynb_checkpoints/edgelist-checkpoint.py | 126 +- .../graph_stat-checkpoint.py | 164 +-- .../plotting_utils-checkpoint.py | 200 +-- tgx/utils/__init__.py | 8 +- tgx/utils/graph_utils.py | 562 ++++----- tgx/utils/newstat.py | 324 ++--- tgx/utils/plotting_utils.py | 269 ++-- tgx/utils/stat.py | 1100 ++++++++--------- tgx/viz/TEA.py | 438 +++---- tgx/viz/TET.py | 670 +++++----- 21 files changed, 2884 insertions(+), 2886 deletions(-) diff --git a/.gitignore b/.gitignore index 7883903..a593edc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ !requirements*.txt #dataset +*.png *.cpython-39.pyc *.pyc *.xz diff --git a/docs/tutorials/data_loader.ipynb b/docs/tutorials/data_loader.ipynb index 8ef5741..f2bd485 100644 --- a/docs/tutorials/data_loader.ipynb +++ b/docs/tutorials/data_loader.ipynb @@ -260,7 +260,7 @@ "\n", "node_engagement = get_avg_node_engagement(data)\n", "filename = f\"{dataset.name}_ave_node_engagement_per_ts\"\n", - "tgx.utils.plot_for_snapshots(node_engagement, filename, \"node 
engagement\", plot_title=\"Average node engagement\")" + "tgx.utils.plot_for_snapshots(node_engagement, y_title=\"avg. node engagement\", filename= \"./\" + filename)" ] }, { diff --git a/examples/data_viz.py b/examples/data_viz.py index 3984ad6..d74abe7 100644 --- a/examples/data_viz.py +++ b/examples/data_viz.py @@ -1,59 +1,59 @@ -import tgx -from tgx.utils.graph_utils import list2csv - -""" -1. load a dataset -2. load into a graph -3. discretize the graph -4. save the graph back to a csv -""" - -#! load the datasets -# dataset = tgx.builtin.uci() #built in datasets - -data_name = "tgbl-wiki" #"tgbl-review" -dataset = tgx.tgb_data(data_name) #tgb datasets - - -ctdg = tgx.Graph(dataset) -time_scale = "daily" -dtdg = ctdg.discretize(time_scale=time_scale) - - -#! plotting the statistics, works -tgx.degree_over_time(dtdg, network_name=data_name) -tgx.nodes_over_time(dtdg, network_name=data_name) -tgx.edges_over_time(dtdg, network_name=data_name) -tgx.nodes_and_edges_over_time(dtdg, network_name=data_name) - -tgx.TET(dtdg, - network_name=data_name, - figsize = (9, 5), - axis_title_font_size = 24, - ticks_font_size = 24) - - -tgx.TEA(dtdg, - network_name=data_name) - - - -#! compute statistics -test_ratio = 0.15 -tgx.get_reoccurrence(ctdg, test_ratio=test_ratio) -tgx.get_surprise(ctdg, test_ratio=test_ratio) - -#* these two much faster on dtdgs -tgx.get_avg_node_activity(dtdg) -tgx.get_novelty(dtdg) - - - - - -# #! statistics to be updated and fixed -# #TODO -# tgx.degree_density() -# tgx.connected_components_per_ts() -# tgx.size_connected_components() +import tgx +from tgx.utils.graph_utils import list2csv + +""" +1. load a dataset +2. load into a graph +3. discretize the graph +4. save the graph back to a csv +""" + +#! load the datasets +# dataset = tgx.builtin.uci() #built in datasets + +data_name = "tgbl-wiki" #"tgbl-review" +dataset = tgx.tgb_data(data_name) #tgb datasets + + +ctdg = tgx.Graph(dataset) +time_scale = "daily" +dtdg = ctdg.discretize(time_scale=time_scale)[0] + + +#! plotting the statistics, works +tgx.degree_over_time(dtdg, network_name=data_name) +tgx.nodes_over_time(dtdg, network_name=data_name) +tgx.edges_over_time(dtdg, network_name=data_name) +tgx.nodes_and_edges_over_time(dtdg, network_name=data_name) + +tgx.TET(dtdg, + network_name=data_name, + figsize = (9, 5), + axis_title_font_size = 24, + ticks_font_size = 24) + + +tgx.TEA(dtdg, + network_name=data_name) + + + +#! compute statistics +test_ratio = 0.15 +tgx.get_reoccurrence(ctdg, test_ratio=test_ratio) +tgx.get_surprise(ctdg, test_ratio=test_ratio) + +#* these two much faster on dtdgs +tgx.get_avg_node_activity(dtdg) +tgx.get_novelty(dtdg) + + + + + +# #! 
statistics to be updated and fixed +# #TODO +# tgx.degree_density() +# tgx.connected_components_per_ts() +# tgx.size_connected_components() # tgx.get_avg_node_engagement() \ No newline at end of file diff --git a/tgx/__init__.py b/tgx/__init__.py index 3e370d9..b3718af 100644 --- a/tgx/__init__.py +++ b/tgx/__init__.py @@ -1,14 +1,14 @@ -from tgx.classes.graph import Graph - -from tgx.data.builtin import builtin -from tgx.data.tgb import tgb_data - -from tgx.io.read import read_csv -from tgx.io.write import write_csv - -from tgx.viz.TEA import TEA -from tgx.viz.TET import TET - -from tgx.utils.stat import * -from tgx.utils.graph_utils import * - +from tgx.classes.graph import Graph + +from tgx.data.builtin import builtin +from tgx.data.tgb import tgb_data + +from tgx.io.read import read_csv +from tgx.io.write import write_csv + +from tgx.viz.TEA import TEA +from tgx.viz.TET import TET + +from tgx.utils.stat import * +from tgx.utils.graph_utils import * + diff --git a/tgx/classes/.ipynb_checkpoints/graph-checkpoint.py b/tgx/classes/.ipynb_checkpoints/graph-checkpoint.py index 2a913d6..9c79dc3 100644 --- a/tgx/classes/.ipynb_checkpoints/graph-checkpoint.py +++ b/tgx/classes/.ipynb_checkpoints/graph-checkpoint.py @@ -1,85 +1,85 @@ -import networkx as nx -from typing import Optional - - -class Graph(): - def __init__(self, - edgelist: Optional[dict] = None, - discretized: Optional[bool] = True): - """ - Create a Graph object with specific characteristics - Args: - edgelist: a dictionary of temporal edges in the form of {t: {(u, v), freq}} - discretized: whether the given edgelist was discretized or not - """ - - self.edgelist = edgelist - self.subsampled_graph = None - if discretized: - self.discrite_graph = self._generate_graph() - self.discrite_edgelist = edgelist - else: - self.continuous_edgelist = edgelist - - - def number_of_nodes(self, edgelist: dict = None) -> int: - """ - Calculate total number of nodes present in an edgelist - """ - if self.edgelist is None: - return [] - elif edgelist is None: - edgelist = self.edgelist - node_list = {} - for _, edge_data in edgelist.items(): - for (u,v), _ in edge_data.items(): - if u not in node_list: - node_list[u] = 1 - if v not in node_list: - node_list[v] = 1 - return len(node_list.keys()) - - def nodes(self) -> list: - """ - Return a list of nodes present in an edgelist - """ - node_list = {} - for _, edge_data in self.edgelist.items(): - for (u,v), _ in edge_data.items(): - if u not in node_list: - node_list[u] = 1 - if v not in node_list: - node_list[v] = 1 - - self.node_list = list(node_list.keys()) - return list(node_list.keys()) - - def _generate_graph(self, - edgelist: Optional[dict] = None - ) -> list: - ''' - Generate a list of graph snapshots. Each snapshot is a - Networkx graph object. 
- Parameters: - edgelist: a dictionary containing in the form of {t: {(u, v), freq}} - Returns: - G_times: a list of networkx graphs - ''' - if self.edgelist is None: - return [] - elif edgelist is None: - edgelist = self.edgelist - G_times = [] - G = nx.Graph() - cur_t = 0 - for ts, edge_data in edgelist.items(): - for (u,v), n in edge_data.items(): - if (ts != cur_t): - G_times.append(G) - G = nx.Graph() - cur_t = ts - G.add_edge(u, v, freq=n) - G_times.append(G) - return G_times - +import networkx as nx +from typing import Optional + + +class Graph(): + def __init__(self, + edgelist: Optional[dict] = None, + discretized: Optional[bool] = True): + """ + Create a Graph object with specific characteristics + Args: + edgelist: a dictionary of temporal edges in the form of {t: {(u, v), freq}} + discretized: whether the given edgelist was discretized or not + """ + + self.edgelist = edgelist + self.subsampled_graph = None + if discretized: + self.discrite_graph = self._generate_graph() + self.discrite_edgelist = edgelist + else: + self.continuous_edgelist = edgelist + + + def number_of_nodes(self, edgelist: dict = None) -> int: + """ + Calculate total number of nodes present in an edgelist + """ + if self.edgelist is None: + return [] + elif edgelist is None: + edgelist = self.edgelist + node_list = {} + for _, edge_data in edgelist.items(): + for (u,v), _ in edge_data.items(): + if u not in node_list: + node_list[u] = 1 + if v not in node_list: + node_list[v] = 1 + return len(node_list.keys()) + + def nodes(self) -> list: + """ + Return a list of nodes present in an edgelist + """ + node_list = {} + for _, edge_data in self.edgelist.items(): + for (u,v), _ in edge_data.items(): + if u not in node_list: + node_list[u] = 1 + if v not in node_list: + node_list[v] = 1 + + self.node_list = list(node_list.keys()) + return list(node_list.keys()) + + def _generate_graph(self, + edgelist: Optional[dict] = None + ) -> list: + ''' + Generate a list of graph snapshots. Each snapshot is a + Networkx graph object. 
+ Parameters: + edgelist: a dictionary containing in the form of {t: {(u, v), freq}} + Returns: + G_times: a list of networkx graphs + ''' + if self.edgelist is None: + return [] + elif edgelist is None: + edgelist = self.edgelist + G_times = [] + G = nx.Graph() + cur_t = 0 + for ts, edge_data in edgelist.items(): + for (u,v), n in edge_data.items(): + if (ts != cur_t): + G_times.append(G) + G = nx.Graph() + cur_t = ts + G.add_edge(u, v, freq=n) + G_times.append(G) + return G_times + \ No newline at end of file diff --git a/tgx/classes/__pycache__/graph.cpython-39.pyc b/tgx/classes/__pycache__/graph.cpython-39.pyc index 8c02d4ee6aeba2e9aadcd83583365d0ecf2e3291..69fee6614d63fa481f4d4a6b8754fb3bb1923e5c 100644 GIT binary patch delta 41 vcmeyOc}tTok(ZZ?0SN59)}~&P*~rJr!lR;}n^&Trrk|XjlDgTFWw!tT?W_yU delta 67 zcmcbm`9+g2k(ZZ?0SLOj6{h}^-pI$wqGqR`k)NBYpPQMJsGq8znv object: - """ - discretize the graph object based on the given time interval - Args: - time_scale: time interval to discretize the graph - """ - new_G = copy.deepcopy(self) - # discretie differently based on # of intervals of time granularity - output = discretize_edges(self.data, - time_scale = time_scale, - store_unix = store_unix) - disc_G = output[0] - new_G.data = disc_G - if (store_unix): - return new_G, output[1] - else: - return (new_G, ) - - def count_freq(self): - self.freq_data = frequency_count(self.data) - return self - - def subsampling(self, - node_list: Optional[list] = [], - random_selection: Optional[bool] = True, - N: Optional[int] = None) -> object: - new_G = copy.deepcopy(self) - new_G.data = subsampling(new_G, node_list = node_list, random_selection=random_selection, N=N) - return new_G - - def number_of_edges(self) -> int: - r""" - Calculate total number of nodes present in an edgelist - """ - edgelist = self.data - e_num = 0 - for _, edges in edgelist.items(): - e_num += len(edges) - - return e_num - - def unique_edges(self) -> int: - r""" - Calculate the number of unique edges - Parameters: - graph_edgelist: Dictionary containing graph data - """ - unique_edges = {} - for _, e_list in self.data.items(): - for e in e_list: - if e not in unique_edges: - unique_edges[e] = 1 - return len(unique_edges) - - def total_nodes(self) -> int: - r""" - Calculate total number of nodes present in an edgelist - """ - - edgelist = self.data - node_list = {} - for _, edge_data in edgelist.items(): - for edge in edge_data: - (u, v) = edge - if u not in node_list: - node_list[u] = 1 - if v not in node_list: - node_list[v] = 1 - return len(node_list.keys()) - - def node_per_ts(self): - active_nodes = {} - for ts in range(len(self.data)): - edgelist_t = self.data[ts] - active_nodes.append(self.edgelist_node_count(edgelist_t)) - return active_nodes - - def edgelist_node_count(self, edge_data: list): - node_list = {} - for edge in edge_data: - (u, v) = edge - if u not in node_list: - node_list[u] = 1 - if v not in node_list: - node_list[v] = 1 - return len(node_list.keys()) - - def edgelist_node_list(self, edge_data: list): - node_list = {} - for edge in edge_data: - (u, v) = edge - if u not in node_list: - node_list[u] = 1 - if v not in node_list: - node_list[v] = 1 - return list(node_list.keys()) - - def nodes_list(self) -> list: - r""" - Return a list of nodes present in an edgelist - """ - node_list = {} - for _, edge_data in self.edgelist.items(): - for (u,v), _ in edge_data.items(): - if u not in node_list: - node_list[u] = 1 - if v not in node_list: - node_list[v] = 1 - - self.node_list = list(node_list.keys()) - return 
list(node_list.keys()) - - def check_time_gap(self) -> bool: - r""" - Check whether the edgelist timestamps have gaps or not (increments bigger than 1) - Returns: - time_gap: a boolean indicating whether there is a time gap or not - """ - time_gap = False - ts = list(self.data.keys()) - for i in range(1, len(ts)): - if ts[i] - ts[i-1] > 1: - time_gap = True - return time_gap - return time_gap - - def save2csv(self, - fname:str = "output") -> None: - r""" - Save the graph object in an edgelist format to a csv file - Args: - fname: name of the csv file to save the graph, no csv suffix needed - """ - outname = fname + ".csv" - #iterate through all edges - with open(outname, 'w') as csvfile: - print ("saving to ", outname) - csvwriter = csv.writer(csvfile, delimiter=',') - csvwriter.writerow(['timestamp'] + ['source'] + ['destination']) - for t, edges_list in self.data.items(): - for edge in edges_list: - (u, v) = edge - csvwriter.writerow([t] + [u] + [v]) - - - # def _generate_graph(self, - # edgelist: Optional[dict] = None - # ) -> list: - # r''' - # Generate a list of graph snapshots. Each snapshot is a - # Networkx graph object. - # Parameters: - # edgelist: a dictionary containing in the form of {t: {(u, v), freq}} - # Returns: - # G_times: a list of networkx graphs - # ''' - # if self.edgelist is None: - # return [] - # elif edgelist is None: - # edgelist = self.edgelist - # G_times = [] - # G = nx.Graph() - # cur_t = 0 - # for ts, edge_data in edgelist.items(): - # for (u,v), n in edge_data.items(): - # if (ts != cur_t): - # G_times.append(G) - # G = nx.Graph() - # cur_t = ts - # G.add_edge(u, v, freq=n) - # G_times.append(G) - # return G_times - +# import networkx as nx +from typing import Optional, Union +from tgx.utils.graph_utils import discretize_edges, frequency_count, subsampling +from tgx.io.read import read_csv +import copy +import csv + +#TODO should contain a new property tracking the number of timestamps +class Graph(object): + def __init__(self, + dataset: Optional[object] = None, + fname: Optional[str] = None, + edgelist: Optional[dict] = None): + """ + Create a Graph object with specific characteristics + Args: + dataset: a dataset object + edgelist: a dictionary of temporal edges in the form of {t: {(u, v), freq}} + """ + + if dataset is not None: + if isinstance(dataset, type) or isinstance(dataset,object): + #! 
not sure why read csv here + self.data = read_csv(dataset) + elif fname is not None and isinstance(fname, str): + self.data = read_csv(fname) + elif edgelist is not None and isinstance(edgelist, dict): + self.data = edgelist + else: + raise TypeError("Please enter valid input.") + + self.subsampled_graph = None + self.freq_data = None + + + + + def discretize(self, + time_scale: Union[str, int], + store_unix: bool = False) -> object: + """ + discretize the graph object based on the given time interval + Args: + time_scale: time interval to discretize the graph + """ + new_G = copy.deepcopy(self) + # discretie differently based on # of intervals of time granularity + output = discretize_edges(self.data, + time_scale = time_scale, + store_unix = store_unix) + disc_G = output[0] + new_G.data = disc_G + if (store_unix): + return new_G, output[1] + else: + return (new_G, ) + + def count_freq(self): + self.freq_data = frequency_count(self.data) + return self + + def subsampling(self, + node_list: Optional[list] = [], + random_selection: Optional[bool] = True, + N: Optional[int] = None) -> object: + new_G = copy.deepcopy(self) + new_G.data = subsampling(new_G, node_list = node_list, random_selection=random_selection, N=N) + return new_G + + def number_of_edges(self) -> int: + r""" + Calculate total number of nodes present in an edgelist + """ + edgelist = self.data + e_num = 0 + for _, edges in edgelist.items(): + e_num += len(edges) + + return e_num + + def unique_edges(self) -> int: + r""" + Calculate the number of unique edges + Parameters: + graph_edgelist: Dictionary containing graph data + """ + unique_edges = {} + for _, e_list in self.data.items(): + for e in e_list: + if e not in unique_edges: + unique_edges[e] = 1 + return len(unique_edges) + + def total_nodes(self) -> int: + r""" + Calculate total number of nodes present in an edgelist + """ + + edgelist = self.data + node_list = {} + for _, edge_data in edgelist.items(): + for edge in edge_data: + (u, v) = edge + if u not in node_list: + node_list[u] = 1 + if v not in node_list: + node_list[v] = 1 + return len(node_list.keys()) + + def node_per_ts(self): + active_nodes = {} + for ts in range(len(self.data)): + edgelist_t = self.data[ts] + active_nodes.append(self.edgelist_node_count(edgelist_t)) + return active_nodes + + def edgelist_node_count(self, edge_data: list): + node_list = {} + for edge in edge_data: + (u, v) = edge + if u not in node_list: + node_list[u] = 1 + if v not in node_list: + node_list[v] = 1 + return len(node_list.keys()) + + def edgelist_node_list(self, edge_data: list): + node_list = {} + for edge in edge_data: + (u, v) = edge + if u not in node_list: + node_list[u] = 1 + if v not in node_list: + node_list[v] = 1 + return list(node_list.keys()) + + def nodes_list(self) -> list: + r""" + Return a list of nodes present in an edgelist + """ + node_list = {} + for _, edge_data in self.edgelist.items(): + for (u,v), _ in edge_data.items(): + if u not in node_list: + node_list[u] = 1 + if v not in node_list: + node_list[v] = 1 + + self.node_list = list(node_list.keys()) + return list(node_list.keys()) + + def check_time_gap(self) -> bool: + r""" + Check whether the edgelist timestamps have gaps or not (increments bigger than 1) + Returns: + time_gap: a boolean indicating whether there is a time gap or not + """ + time_gap = False + ts = list(self.data.keys()) + for i in range(1, len(ts)): + if ts[i] - ts[i-1] > 1: + time_gap = True + return time_gap + return time_gap + + def save2csv(self, + fname:str = "output") -> 
None: + r""" + Save the graph object in an edgelist format to a csv file + Args: + fname: name of the csv file to save the graph, no csv suffix needed + """ + outname = fname + ".csv" + #iterate through all edges + with open(outname, 'w') as csvfile: + print ("saving to ", outname) + csvwriter = csv.writer(csvfile, delimiter=',') + csvwriter.writerow(['timestamp'] + ['source'] + ['destination']) + for t, edges_list in self.data.items(): + for edge in edges_list: + (u, v) = edge + csvwriter.writerow([t] + [u] + [v]) + + + # def _generate_graph(self, + # edgelist: Optional[dict] = None + # ) -> list: + # r''' + # Generate a list of graph snapshots. Each snapshot is a + # Networkx graph object. + # Parameters: + # edgelist: a dictionary containing in the form of {t: {(u, v), freq}} + # Returns: + # G_times: a list of networkx graphs + # ''' + # if self.edgelist is None: + # return [] + # elif edgelist is None: + # edgelist = self.edgelist + # G_times = [] + # G = nx.Graph() + # cur_t = 0 + # for ts, edge_data in edgelist.items(): + # for (u,v), n in edge_data.items(): + # if (ts != cur_t): + # G_times.append(G) + # G = nx.Graph() + # cur_t = ts + # G.add_edge(u, v, freq=n) + # G_times.append(G) + # return G_times + \ No newline at end of file diff --git a/tgx/data/builtin.py b/tgx/data/builtin.py index e66ad3c..fb9f005 100644 --- a/tgx/data/builtin.py +++ b/tgx/data/builtin.py @@ -1,200 +1,200 @@ -import pandas as pd -import zipfile -import urllib - -__all__ = ["data"] - -root_path = "." - -DataPath={ - 'USLegis' : "/data/USLegis/ml_USLegis.csv", - 'CanParl' : "/data/CanParl/ml_CanParl.csv", - 'UNtrade' : "/data/UNtrade/ml_UNtrade.csv", - 'UNvote' : "/data/UNvote/ml_UNvote.csv", - 'reddit' : "/data/reddit/ml_reddit.csv", - 'Wikipedia' : "/data/wikipedia/ml_wikipedia.csv", - 'enron' : "/data/enron/ml_enron.csv", - 'mooc' : "/data/mooc/ml_mooc.csv", - 'uci' : "/data/uci/ml_uci.csv", - 'SocialEvo' : "/data/SocialEvo/ml_SocialEvo.csv", - 'Flights' : "/data/Flights/ml_Flights.csv", - 'lastfm' : "/data/lastfm/ml_lastfm.csv", - 'Contacts' : "/data/Contacts/ml_Contacts.csv" - } - -Data_specifications = { - 'USLegis' : {'discretize' : False, 'time_scale': None}, - 'CanParl' : {'discretize' : False, 'time_scale': None}, - 'UNvote' : {'discretize' : False, 'time_scale': None}, - 'reddit' : {'discretize' : True, 'time_scale': 'daily'}, - 'enron' : {'discretize' : True, 'time_scale': 'monthly'}, - 'mooc' : {'discretize' : True, 'time_scale': 'daily'}, - 'uci' : {'discretize' : True, 'time_scale': 'weekly'}, - 'SocialEvo' : {'discretize' : True, 'time_scale': 'weekly'}, - 'Flights' : {'discretize' : False, 'time_scale': 121}, - 'Contacts' : {'discretize' : True, 'time_scale': 'daily'}, - 'lastfm' : {'discretize' : True, 'time_scale': 'monthly'} - } - -class builtin(object): - def __init__(self): - """ - Data class for loading default (in-package) temporal datasets - - In order to use "tgb" datasets install tgb package - for more detals visit here: https://tgb.complexdatalab.com/ - - In order to use dgb datasets download and extract dataset file - from here: https://zenodo.org/record/7213796#.Y1cO6y8r30o - and locate them in ./data/ directory. 
- """ - pass - - - def read_specifications(self, - data: type): - """ - Load dataset specifications for dgb datasets - Parameters: - data: str, name of the dataset - """ - self.name = data - self.path = DataPath[data] - # self.header = Data_specifications[data]['header'] - # self.index = Data_specifications[data]['index'] - self.discretize = Data_specifications[data]['discretize'] - self.time_scale = Data_specifications[data]['time_scale'] - return self - - def load_dgb_data(self): - try: - data = pd.read_csv(f"{self.root}{self.path}", index_col=0) - except: - self.download_file(self) - data = pd.read_csv(f"{self.root}{self.path}", index_col=0) - - self.data = data.iloc[:, 0:3].to_numpy() - return self - - def download_file(self): - - print("Data missing, download recommended!") - inp = input('Will you download the dataset(s) now? (y/N)\n').lower() - url = f"https://zenodo.org/record/7213796/files/{self.name}.zip" - path_download = f"./data" - print(path_download) - print(url) - if inp == 'y': - print(f"Download started, this might take a while . . .") - zip_path, _ = urllib.request.urlretrieve(url) - with zipfile.ZipFile(zip_path, "r") as f: - f.extractall(path_download) - print("Download completed") - - else: - print("Download cancelled") - - - @classmethod - def mooc(self, root=root_path): - data = "mooc" - self.root = root - self.read_specifications(self, data) - self.load_dgb_data(self) - return self - - @classmethod - def uci(self, root=root_path): - data = "uci" - self.root = root - self.read_specifications(self, data) - self.load_dgb_data(self) - return self - - @classmethod - def uslegis(self, root=root_path): - data = "USLegis" - self.root = root - self.read_specifications(self, data) - self.load_dgb_data(self) - return self - - @classmethod - def canparl(self, root=root_path): - data = "CanParl" - self.root = root - self.read_specifications(self, data) - self.load_dgb_data(self) - return self - - @classmethod - def untrade(self, root=root_path): - data = "UNtrade" - self.root = root - self.read_specifications(self, data) - self.load_dgb_data(self) - return self - - @classmethod - def unvote(self, root=root_path): - data = "UNvote" - self.root = root - self.read_specifications(self, data) - self.load_dgb_data(self) - return self - - @classmethod - def reddit(self, root=root_path): - data = "reddit" - self.root = root - self.read_specifications(self, data) - self.load_dgb_data(self) - return self - - @classmethod - def wikipedia(self, root=root_path): - data = "Wikipedia" - self.root = root - self.read_specifications(self, data) - self.load_dgb_data(self) - return self - - @classmethod - def enron(self, root=root_path): - data = "enron" - self.root = root - self.read_specifications(self, data) - self.load_dgb_data(self) - return self - - @classmethod - def social_evo(self, root=root_path): - data = "SocialEvo" - self.root = root - self.read_specifications(self, data) - self.load_dgb_data(self) - return self - - @classmethod - def flights(self, root=root_path): - data = "Flights" - self.root = root - self.read_specifications(self, data) - self.load_dgb_data(self) - return self - - @classmethod - def lastfm(self, root=root_path): - data = "lastfm" - self.root = root - self.read_specifications(self, data) - self.load_dgb_data(self) - return self - - @classmethod - def contacts(self, root=root_path): - data = "Contacts" - self.root = root - self.read_specifications(self, data) - self.load_dgb_data(self) - return self +import pandas as pd +import zipfile +import urllib + +__all__ = 
["data"] + +root_path = "." + +DataPath={ + 'USLegis' : "/data/USLegis/ml_USLegis.csv", + 'CanParl' : "/data/CanParl/ml_CanParl.csv", + 'UNtrade' : "/data/UNtrade/ml_UNtrade.csv", + 'UNvote' : "/data/UNvote/ml_UNvote.csv", + 'reddit' : "/data/reddit/ml_reddit.csv", + 'Wikipedia' : "/data/wikipedia/ml_wikipedia.csv", + 'enron' : "/data/enron/ml_enron.csv", + 'mooc' : "/data/mooc/ml_mooc.csv", + 'uci' : "/data/uci/ml_uci.csv", + 'SocialEvo' : "/data/SocialEvo/ml_SocialEvo.csv", + 'Flights' : "/data/Flights/ml_Flights.csv", + 'lastfm' : "/data/lastfm/ml_lastfm.csv", + 'Contacts' : "/data/Contacts/ml_Contacts.csv" + } + +Data_specifications = { + 'USLegis' : {'discretize' : False, 'time_scale': None}, + 'CanParl' : {'discretize' : False, 'time_scale': None}, + 'UNvote' : {'discretize' : False, 'time_scale': None}, + 'reddit' : {'discretize' : True, 'time_scale': 'daily'}, + 'enron' : {'discretize' : True, 'time_scale': 'monthly'}, + 'mooc' : {'discretize' : True, 'time_scale': 'daily'}, + 'uci' : {'discretize' : True, 'time_scale': 'weekly'}, + 'SocialEvo' : {'discretize' : True, 'time_scale': 'weekly'}, + 'Flights' : {'discretize' : False, 'time_scale': 121}, + 'Contacts' : {'discretize' : True, 'time_scale': 'daily'}, + 'lastfm' : {'discretize' : True, 'time_scale': 'monthly'} + } + +class builtin(object): + def __init__(self): + """ + Data class for loading default (in-package) temporal datasets + + In order to use "tgb" datasets install tgb package + for more detals visit here: https://tgb.complexdatalab.com/ + + In order to use dgb datasets download and extract dataset file + from here: https://zenodo.org/record/7213796#.Y1cO6y8r30o + and locate them in ./data/ directory. + """ + pass + + + def read_specifications(self, + data: type): + """ + Load dataset specifications for dgb datasets + Parameters: + data: str, name of the dataset + """ + self.name = data + self.path = DataPath[data] + # self.header = Data_specifications[data]['header'] + # self.index = Data_specifications[data]['index'] + self.discretize = Data_specifications[data]['discretize'] + self.time_scale = Data_specifications[data]['time_scale'] + return self + + def load_dgb_data(self): + try: + data = pd.read_csv(f"{self.root}{self.path}", index_col=0) + except: + self.download_file(self) + data = pd.read_csv(f"{self.root}{self.path}", index_col=0) + + self.data = data.iloc[:, 0:3].to_numpy() + return self + + def download_file(self): + + print("Data missing, download recommended!") + inp = input('Will you download the dataset(s) now? (y/N)\n').lower() + url = f"https://zenodo.org/record/7213796/files/{self.name}.zip" + path_download = f"./data" + print(path_download) + print(url) + if inp == 'y': + print(f"Download started, this might take a while . . 
.") + zip_path, _ = urllib.request.urlretrieve(url) + with zipfile.ZipFile(zip_path, "r") as f: + f.extractall(path_download) + print("Download completed") + + else: + print("Download cancelled") + + + @classmethod + def mooc(self, root=root_path): + data = "mooc" + self.root = root + self.read_specifications(self, data) + self.load_dgb_data(self) + return self + + @classmethod + def uci(self, root=root_path): + data = "uci" + self.root = root + self.read_specifications(self, data) + self.load_dgb_data(self) + return self + + @classmethod + def uslegis(self, root=root_path): + data = "USLegis" + self.root = root + self.read_specifications(self, data) + self.load_dgb_data(self) + return self + + @classmethod + def canparl(self, root=root_path): + data = "CanParl" + self.root = root + self.read_specifications(self, data) + self.load_dgb_data(self) + return self + + @classmethod + def untrade(self, root=root_path): + data = "UNtrade" + self.root = root + self.read_specifications(self, data) + self.load_dgb_data(self) + return self + + @classmethod + def unvote(self, root=root_path): + data = "UNvote" + self.root = root + self.read_specifications(self, data) + self.load_dgb_data(self) + return self + + @classmethod + def reddit(self, root=root_path): + data = "reddit" + self.root = root + self.read_specifications(self, data) + self.load_dgb_data(self) + return self + + @classmethod + def wikipedia(self, root=root_path): + data = "Wikipedia" + self.root = root + self.read_specifications(self, data) + self.load_dgb_data(self) + return self + + @classmethod + def enron(self, root=root_path): + data = "enron" + self.root = root + self.read_specifications(self, data) + self.load_dgb_data(self) + return self + + @classmethod + def social_evo(self, root=root_path): + data = "SocialEvo" + self.root = root + self.read_specifications(self, data) + self.load_dgb_data(self) + return self + + @classmethod + def flights(self, root=root_path): + data = "Flights" + self.root = root + self.read_specifications(self, data) + self.load_dgb_data(self) + return self + + @classmethod + def lastfm(self, root=root_path): + data = "lastfm" + self.root = root + self.read_specifications(self, data) + self.load_dgb_data(self) + return self + + @classmethod + def contacts(self, root=root_path): + data = "Contacts" + self.root = root + self.read_specifications(self, data) + self.load_dgb_data(self) + return self diff --git a/tgx/data/tgb.py b/tgx/data/tgb.py index 8ef0851..a171709 100644 --- a/tgx/data/tgb.py +++ b/tgx/data/tgb.py @@ -1,103 +1,103 @@ -import numpy as np - -Data_specifications = { - 'tgbl-wiki' : {'discretize' : True, 'time_scale': 'daily'}, - 'tgbl-review' : {'discretize' : True, 'time_scale': 'yearly'}, - 'tgbl-coin' : {'discretize' : True, 'time_scale': 'weekly'}, - 'tgbl-comment' : {'discretize' : True, 'time_scale': 'monthly'}, - 'tgbl-flight' : {'discretize' : True, 'time_scale': 'monthly'}, - 'tgbn-trade' : {'discretize' : False, 'time_scale': None}, - 'tgbn-genre' : {'discretize' : True, 'time_scale': 'monthly'}, - 'tgbn-reddit' : {'discretize' : True, 'time_scale': 'monthly'} - } - -class tgb_data(object): - def __init__(self, dname: str, - edge_feat: bool = False, - w: bool = False, - edge_label: bool = False, - edge_idxs: bool = False): - """ - Data class for loading default (in-package) temporal datasets - - In order to use "tgb" datasets install tgb package - for more detals visit here: https://tgb.complexdatalab.com/ - - In order to use dgb datasets download and extract dataset file - from 
here: https://zenodo.org/record/7213796#.Y1cO6y8r30o - and locate them in ./data/ directory. - """ - self.tgb(dname, - edge_feat = edge_feat, - w = w, - edge_label = edge_label, - edge_idxs = edge_idxs) - - return - - @classmethod - def tgb(self, dname: str, - edge_feat: bool = False, - w: bool = False, - edge_label: bool = False, - edge_idxs: bool = False): - """ - Load datasets from "tgb" package. To load these datasets you need to install tgb package. - Parameters: - dname: str, name of the dataset from the list: - ["tgbl-wiki", "tgbl-review", - "tgbl-coin", "tgbl-comment", - "tgbl-flight","tgbn-trade", - "tgbn-genre", "tgbn-reddit"] - edge_feat: list of edge features - w: edge weights - edge_label: edge labels - edge_idxs: edge indexes - - """ - try: - from tgb.linkproppred.dataset import LinkPropPredDataset - from tgb.nodeproppred.dataset import NodePropPredDataset - except: - print("First install TGB package using 'pip install py-tgb'") - - link_pred = ["tgbl-wiki", "tgbl-review", "tgbl-coin", "tgbl-comment", "tgbl-flight"] - node_pred = ["tgbn-trade", "tgbn-genre", "tgbn-reddit"] - if dname in link_pred: - data = LinkPropPredDataset(name=dname, root="datasets", preprocess=True) - elif dname in node_pred: - data = NodePropPredDataset(name=dname, root="datasets", preprocess=True) - else: - raise ValueError("Invalid tgb dataset name") - - data = data.full_data - data = np.array([data['sources'], data["destinations"], data["timestamps"]]) - self.data = np.transpose(data) - - if edge_feat: - self.edge_feat = data['edge_feat'] - if w: - self.w = data['w'] - if edge_label: - self.edge_label = data['edge_label'] - if edge_idxs: - self.edge_idxs = data['edge_idxs'] - - self.discretize = Data_specifications[dname]['discretize'] - self.time_scale = Data_specifications[dname]['time_scale'] - self.name = dname - - return self - - - def read_specifications(self, - data: type): - """ - Load dataset specifications for dgb datasets - Parameters: - data: str, name of the dataset - """ - self.name = data - self.discretize = Data_specifications[data]['discretize'] - self.time_scale = Data_specifications[data]['time_scale'] +import numpy as np + +Data_specifications = { + 'tgbl-wiki' : {'discretize' : True, 'time_scale': 'daily'}, + 'tgbl-review' : {'discretize' : True, 'time_scale': 'yearly'}, + 'tgbl-coin' : {'discretize' : True, 'time_scale': 'weekly'}, + 'tgbl-comment' : {'discretize' : True, 'time_scale': 'monthly'}, + 'tgbl-flight' : {'discretize' : True, 'time_scale': 'monthly'}, + 'tgbn-trade' : {'discretize' : False, 'time_scale': None}, + 'tgbn-genre' : {'discretize' : True, 'time_scale': 'monthly'}, + 'tgbn-reddit' : {'discretize' : True, 'time_scale': 'monthly'} + } + +class tgb_data(object): + def __init__(self, dname: str, + edge_feat: bool = False, + w: bool = False, + edge_label: bool = False, + edge_idxs: bool = False): + """ + Data class for loading default (in-package) temporal datasets + + In order to use "tgb" datasets install tgb package + for more detals visit here: https://tgb.complexdatalab.com/ + + In order to use dgb datasets download and extract dataset file + from here: https://zenodo.org/record/7213796#.Y1cO6y8r30o + and locate them in ./data/ directory. + """ + self.tgb(dname, + edge_feat = edge_feat, + w = w, + edge_label = edge_label, + edge_idxs = edge_idxs) + + return + + @classmethod + def tgb(self, dname: str, + edge_feat: bool = False, + w: bool = False, + edge_label: bool = False, + edge_idxs: bool = False): + """ + Load datasets from "tgb" package. 
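A hedged sketch of driving the `tgb_data` loader defined in this file (assuming `py-tgb` is installed; the optional `edge_feat`/`w`/`edge_label`/`edge_idxs` flags appear to index `data` after it has been replaced by the stacked numpy array, so they are left at their defaults here):

from tgx.data.tgb import tgb_data

d = tgb_data("tgbl-wiki")          # downloads and preprocesses via LinkPropPredDataset on first use
print(d.data.shape)                # (num_edges, 3): source, destination, timestamp columns
print(d.discretize, d.time_scale)  # True daily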
To load these datasets you need to install tgb package. + Parameters: + dname: str, name of the dataset from the list: + ["tgbl-wiki", "tgbl-review", + "tgbl-coin", "tgbl-comment", + "tgbl-flight","tgbn-trade", + "tgbn-genre", "tgbn-reddit"] + edge_feat: list of edge features + w: edge weights + edge_label: edge labels + edge_idxs: edge indexes + + """ + try: + from tgb.linkproppred.dataset import LinkPropPredDataset + from tgb.nodeproppred.dataset import NodePropPredDataset + except: + print("First install TGB package using 'pip install py-tgb'") + + link_pred = ["tgbl-wiki", "tgbl-review", "tgbl-coin", "tgbl-comment", "tgbl-flight"] + node_pred = ["tgbn-trade", "tgbn-genre", "tgbn-reddit"] + if dname in link_pred: + data = LinkPropPredDataset(name=dname, root="datasets", preprocess=True) + elif dname in node_pred: + data = NodePropPredDataset(name=dname, root="datasets", preprocess=True) + else: + raise ValueError("Invalid tgb dataset name") + + data = data.full_data + data = np.array([data['sources'], data["destinations"], data["timestamps"]]) + self.data = np.transpose(data) + + if edge_feat: + self.edge_feat = data['edge_feat'] + if w: + self.w = data['w'] + if edge_label: + self.edge_label = data['edge_label'] + if edge_idxs: + self.edge_idxs = data['edge_idxs'] + + self.discretize = Data_specifications[dname]['discretize'] + self.time_scale = Data_specifications[dname]['time_scale'] + self.name = dname + + return self + + + def read_specifications(self, + data: type): + """ + Load dataset specifications for dgb datasets + Parameters: + data: str, name of the dataset + """ + self.name = data + self.discretize = Data_specifications[data]['discretize'] + self.time_scale = Data_specifications[data]['time_scale'] return self \ No newline at end of file diff --git a/tgx/io/read.py b/tgx/io/read.py index 769a23d..4978be1 100644 --- a/tgx/io/read.py +++ b/tgx/io/read.py @@ -1,242 +1,242 @@ -import pandas as pd -import csv -import numpy as np -from typing import Optional, Union -# from tgx.datasets.data_loader import read_dataset - - -# data: Optional[object] = None, -# is_discretized: bool = False, -# discretize: bool = False, -# time_scale: Union[str, int, None] = None, - -def read_csv(fname: Union[str, object] = None, - header: bool = False, - index: bool = False, - t_col: int = 2,) -> dict: - - """ - Read temporal edgelist and store it in a dictionary. 
- Parameters: - fname: directory of a dataset in .csv format or data object created from loading dgb/tgb datasets - header: whether first line of data file is header - index: whether the first column is row indices - t_col: column indext for timestamps (0 or 2) - ts_sorted: if data are sorted based on timestamp - - Returns: - temp_edgelist: A dictionary of edges and their frequency at each time interval - """ - - start_col = 0 - if index: - start_col = 1 - t_col += 1 - - if t_col < 2: - u_col = t_col + 1 - else: - u_col = start_col - v_col = u_col + 1 - - cols_to_read = [u_col, v_col, t_col] - - if isinstance(fname, type) or isinstance(fname, object): - return _datasets_edgelist_loader(fname.data) - elif isinstance(fname, str): - return _load_edgelist(fname, cols_to_read, header=header) - else: - raise TypeError("Invalid input") - - -def _load_edgelist(fname, columns, header): - """ - read edges from the file and store them in a dictionary - Parameters: - fname: file address - columns: order of the nodes and timestamp - header: Whether the data file contains header - """ - try: - edgelist = open(fname, "r") - except: - raise FileNotFoundError("No such file or directory.") - edgelist.readline() - lines = list(edgelist.readlines()) - edgelist.close() - - u_idx, v_idx, ts_idx = columns - temp_edgelist = {} - unique_edges = {} - edges_list = [] - total_edges = 0 - sorted = True - previous_t = 0 - if header: - first_line = 1 - else: - first_line = 0 - for i in range(first_line, len(lines)): - line = lines[i] - values = line.split(',') - t = int(float(values[ts_idx])) - u = values[u_idx] - v = values[v_idx] - - if i == first_line: - curr_t = t - - # Check if the dataset is sorted - if t < previous_t: - sorted = False - previous_t = t - - if t not in temp_edgelist: - temp_edgelist[t] = [] - - # temp_edgelist[curr_t] = edges_list - # edges_list = [] - # curr_t = t - - temp_edgelist[t].append((u, v)) - if (u,v) not in unique_edges: - unique_edges[(u, v)] = 1 - total_edges += 1 - # temp_edgelist[curr_t] = edges_list - - if sorted is False: - print("edgelist not sorted, sorting dataset...") - myKeys = list(temp_edgelist.keys()) - myKeys.sort() - temp_edgelist = {i: temp_edgelist[i] for i in myKeys} - - print("Number of loaded edges: " + str(total_edges)) - print("Number of unique edges:" , len(unique_edges.keys())) - print("Available timestamps: ", len(temp_edgelist.keys())) - return temp_edgelist - -def _datasets_edgelist_loader(data) -> dict: - """ - load built-in datasets and tgb datasets - """ - temp_edgelist = {} - total_edges = 0 - unique_edges = {} - first_line = 0 - previous_t = 0 - edges_list = [] - sorted = True - for line in data: - u = line[0] - v = line[1] - t = int(float(line[2])) - if first_line == 0: - curr_t = t - first_line += 1 - - # Check if the dataset is sorted - if t < previous_t: - sorted = False - previous_t = t - - if t != curr_t: - temp_edgelist[curr_t] = edges_list - edges_list = [] - curr_t = t - - edges_list.append((u, v)) - if (u,v) not in unique_edges: - unique_edges[(u, v)] = 1 - total_edges += 1 - temp_edgelist[curr_t] = edges_list - - if sorted is False: - print("Sorting dataset...") - myKeys = list(temp_edgelist.keys()) - myKeys.sort() - temp_edgelist = {i: temp_edgelist[i] for i in myKeys} - - print("Number of loaded edges: " + str(total_edges)) - print("Number of unique edges:" + str(len(unique_edges.keys()))) - print("Available timestamps: ", len(temp_edgelist.keys())) - - return temp_edgelist - - -def _load_edgelist_with_discretizer( - fname : str, - columns : 
list, - time_scale : Union[str , int] = 86400, - header : Optional[bool] = True) -> dict: - """ - load temporal edgelist into a dictionary - assumption: the edges are ordered in increasing order of their timestamp - ''' - the timestamp in the edgelist is based cardinal - more detail see here: https://github.com/srijankr/jodie - need to merge edges in a period of time into an interval - 86400 is # of secs in a day, good interval size - ''' - """ - # print("Info: Interval size:", interval_size) - edgelist = open(fname, "r") - edgelist.readline() - lines = list(edgelist.readlines()) - edgelist.close() - - - u_idx, v_idx, ts_idx = columns - - if isinstance(time_scale, str): - if time_scale == "daily": - interval_size = 86400 - elif time_scale == "weekly": - interval_size = 86400 * 7 - elif time_scale == "monthly": - interval_size = 86400 * 30 - elif time_scale == "yearly": - interval_size = 86400* 365 - elif isinstance(time_scale, int): - last_line = lines[-1] - values = last_line.split(',') - total_time = float(values[ts_idx]) - interval_size = int(total_time / (time_scale-1)) - else: - raise TypeError("Invalid time interval") - - temporal_edgelist = {} - total_n_edges = 0 - - if header: - first_line = 1 - else: - first_line = 0 - - - for i in range(first_line, len(lines)): - line = lines[i] - values = line.split(',') - - total_n_edges += 1 - # values = line.strip().split(',') - u = values[u_idx] # source node - v = values[v_idx] # destination node - ts = float(values[ts_idx]) # timestamp - ts_bin_id = int(ts / interval_size) - if ts_bin_id not in temporal_edgelist: - temporal_edgelist[ts_bin_id] = {} - temporal_edgelist[ts_bin_id][(u, v)] = 1 - else: - if (u, v) not in temporal_edgelist[ts_bin_id]: - temporal_edgelist[ts_bin_id][(u, v)] = 1 - else: - temporal_edgelist[ts_bin_id][(u, v)] += 1 - - print("Loading edge-list: Maximum timestamp is ", ts) - print("Loading edge-list: Maximum timestamp-bin-id is", ts_bin_id) - print("Loading edge-list: Total number of edges:", total_n_edges) - return temporal_edgelist - - - - +import pandas as pd +import csv +import numpy as np +from typing import Optional, Union +# from tgx.datasets.data_loader import read_dataset + + +# data: Optional[object] = None, +# is_discretized: bool = False, +# discretize: bool = False, +# time_scale: Union[str, int, None] = None, + +def read_csv(fname: Union[str, object] = None, + header: bool = False, + index: bool = False, + t_col: int = 2,) -> dict: + + """ + Read temporal edgelist and store it in a dictionary. 
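The index arithmetic at the top of `read_csv` (identical on both sides of this diff) is easy to misread; a standalone sketch of the same rule, using an assumed helper name `_resolve_columns`:

def _resolve_columns(t_col: int = 2, index: bool = False):
    # mirrors read_csv: an index column shifts everything right by one
    start_col = 1 if index else 0
    t = t_col + 1 if index else t_col
    u = t + 1 if t < 2 else start_col   # timestamp-first files put the node columns after it
    v = u + 1
    return u, v, t

print(_resolve_columns())            # (0, 1, 2): u, v, t for node-node-time files
print(_resolve_columns(t_col=0))     # (1, 2, 0): time-node-node files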
+ Parameters: + fname: directory of a dataset in .csv format or data object created from loading dgb/tgb datasets + header: whether first line of data file is header + index: whether the first column is row indices + t_col: column indext for timestamps (0 or 2) + ts_sorted: if data are sorted based on timestamp + + Returns: + temp_edgelist: A dictionary of edges and their frequency at each time interval + """ + + start_col = 0 + if index: + start_col = 1 + t_col += 1 + + if t_col < 2: + u_col = t_col + 1 + else: + u_col = start_col + v_col = u_col + 1 + + cols_to_read = [u_col, v_col, t_col] + + if isinstance(fname, type) or isinstance(fname, object): + return _datasets_edgelist_loader(fname.data) + elif isinstance(fname, str): + return _load_edgelist(fname, cols_to_read, header=header) + else: + raise TypeError("Invalid input") + + +def _load_edgelist(fname, columns, header): + """ + read edges from the file and store them in a dictionary + Parameters: + fname: file address + columns: order of the nodes and timestamp + header: Whether the data file contains header + """ + try: + edgelist = open(fname, "r") + except: + raise FileNotFoundError("No such file or directory.") + edgelist.readline() + lines = list(edgelist.readlines()) + edgelist.close() + + u_idx, v_idx, ts_idx = columns + temp_edgelist = {} + unique_edges = {} + edges_list = [] + total_edges = 0 + sorted = True + previous_t = 0 + if header: + first_line = 1 + else: + first_line = 0 + for i in range(first_line, len(lines)): + line = lines[i] + values = line.split(',') + t = int(float(values[ts_idx])) + u = values[u_idx] + v = values[v_idx] + + if i == first_line: + curr_t = t + + # Check if the dataset is sorted + if t < previous_t: + sorted = False + previous_t = t + + if t not in temp_edgelist: + temp_edgelist[t] = [] + + # temp_edgelist[curr_t] = edges_list + # edges_list = [] + # curr_t = t + + temp_edgelist[t].append((u, v)) + if (u,v) not in unique_edges: + unique_edges[(u, v)] = 1 + total_edges += 1 + # temp_edgelist[curr_t] = edges_list + + if sorted is False: + print("edgelist not sorted, sorting dataset...") + myKeys = list(temp_edgelist.keys()) + myKeys.sort() + temp_edgelist = {i: temp_edgelist[i] for i in myKeys} + + print("Number of loaded edges: " + str(total_edges)) + print("Number of unique edges:" , len(unique_edges.keys())) + print("Available timestamps: ", len(temp_edgelist.keys())) + return temp_edgelist + +def _datasets_edgelist_loader(data) -> dict: + """ + load built-in datasets and tgb datasets + """ + temp_edgelist = {} + total_edges = 0 + unique_edges = {} + first_line = 0 + previous_t = 0 + edges_list = [] + sorted = True + for line in data: + u = line[0] + v = line[1] + t = int(float(line[2])) + if first_line == 0: + curr_t = t + first_line += 1 + + # Check if the dataset is sorted + if t < previous_t: + sorted = False + previous_t = t + + if t != curr_t: + temp_edgelist[curr_t] = edges_list + edges_list = [] + curr_t = t + + edges_list.append((u, v)) + if (u,v) not in unique_edges: + unique_edges[(u, v)] = 1 + total_edges += 1 + temp_edgelist[curr_t] = edges_list + + if sorted is False: + print("Sorting dataset...") + myKeys = list(temp_edgelist.keys()) + myKeys.sort() + temp_edgelist = {i: temp_edgelist[i] for i in myKeys} + + print("Number of loaded edges: " + str(total_edges)) + print("Number of unique edges:" + str(len(unique_edges.keys()))) + print("Available timestamps: ", len(temp_edgelist.keys())) + + return temp_edgelist + + +def _load_edgelist_with_discretizer( + fname : str, + columns : 
list, + time_scale : Union[str , int] = 86400, + header : Optional[bool] = True) -> dict: + """ + load temporal edgelist into a dictionary + assumption: the edges are ordered in increasing order of their timestamp + ''' + the timestamp in the edgelist is based cardinal + more detail see here: https://github.com/srijankr/jodie + need to merge edges in a period of time into an interval + 86400 is # of secs in a day, good interval size + ''' + """ + # print("Info: Interval size:", interval_size) + edgelist = open(fname, "r") + edgelist.readline() + lines = list(edgelist.readlines()) + edgelist.close() + + + u_idx, v_idx, ts_idx = columns + + if isinstance(time_scale, str): + if time_scale == "daily": + interval_size = 86400 + elif time_scale == "weekly": + interval_size = 86400 * 7 + elif time_scale == "monthly": + interval_size = 86400 * 30 + elif time_scale == "yearly": + interval_size = 86400* 365 + elif isinstance(time_scale, int): + last_line = lines[-1] + values = last_line.split(',') + total_time = float(values[ts_idx]) + interval_size = int(total_time / (time_scale-1)) + else: + raise TypeError("Invalid time interval") + + temporal_edgelist = {} + total_n_edges = 0 + + if header: + first_line = 1 + else: + first_line = 0 + + + for i in range(first_line, len(lines)): + line = lines[i] + values = line.split(',') + + total_n_edges += 1 + # values = line.strip().split(',') + u = values[u_idx] # source node + v = values[v_idx] # destination node + ts = float(values[ts_idx]) # timestamp + ts_bin_id = int(ts / interval_size) + if ts_bin_id not in temporal_edgelist: + temporal_edgelist[ts_bin_id] = {} + temporal_edgelist[ts_bin_id][(u, v)] = 1 + else: + if (u, v) not in temporal_edgelist[ts_bin_id]: + temporal_edgelist[ts_bin_id][(u, v)] = 1 + else: + temporal_edgelist[ts_bin_id][(u, v)] += 1 + + print("Loading edge-list: Maximum timestamp is ", ts) + print("Loading edge-list: Maximum timestamp-bin-id is", ts_bin_id) + print("Loading edge-list: Total number of edges:", total_n_edges) + return temporal_edgelist + + + + diff --git a/tgx/io/write.py b/tgx/io/write.py index 4988d6e..84da08f 100644 --- a/tgx/io/write.py +++ b/tgx/io/write.py @@ -1,42 +1,42 @@ -# # a = {1:[(1,3), (4,5), (5,6)], -# # 2:[(2,5)]} -# # print(a.items()) - - -# Details = {"Destination": "China", -# "Nationality": "Italian", "Age": []} - -# print("Original:", Details) - -# # appending the list -# Details["Age"] += [20, "Twenty"] -# print("Modified:", Details) - - -# a1 = [(1,2,3), (1,2,3), (2,3,4)] -# d={} -# lis = [] -# t = 1 -# for i in a1: -# q1=i[0] -# q2=i[1] -# q3=i[2] -# if q1 not in d: -# d[q1] = [] -# print(d) -# d[q1].append((q2,q3)) -# if q1 != t: -# d[t] = lis -# lis=[] -# t = q1 -# lis.append((q2,q3)) -# d[t] = lis -# print(d) - - -# for i, l in a.items(): -# for s in l: -# print (i, s[0], s[1]) - -def write_csv(): +# # a = {1:[(1,3), (4,5), (5,6)], +# # 2:[(2,5)]} +# # print(a.items()) + + +# Details = {"Destination": "China", +# "Nationality": "Italian", "Age": []} + +# print("Original:", Details) + +# # appending the list +# Details["Age"] += [20, "Twenty"] +# print("Modified:", Details) + + +# a1 = [(1,2,3), (1,2,3), (2,3,4)] +# d={} +# lis = [] +# t = 1 +# for i in a1: +# q1=i[0] +# q2=i[1] +# q3=i[2] +# if q1 not in d: +# d[q1] = [] +# print(d) +# d[q1].append((q2,q3)) +# if q1 != t: +# d[t] = lis +# lis=[] +# t = q1 +# lis.append((q2,q3)) +# d[t] = lis +# print(d) + + +# for i, l in a.items(): +# for s in l: +# print (i, s[0], s[1]) + +def write_csv(): pass \ No newline at end of file diff --git 
a/tgx/utils/.ipynb_checkpoints/edgelist-checkpoint.py b/tgx/utils/.ipynb_checkpoints/edgelist-checkpoint.py index ef6dad5..45d9a1d 100644 --- a/tgx/utils/.ipynb_checkpoints/edgelist-checkpoint.py +++ b/tgx/utils/.ipynb_checkpoints/edgelist-checkpoint.py @@ -1,64 +1,64 @@ - -def edgelist_discritizer(edgelist, - unique_ts, - time_interval = None, - max_intervals = 200): - - total_time = unique_ts[-1] - unique_ts[0] - if time_interval is not None: - if isinstance(time_interval, str): - if time_interval == "daily": - interval_size = 86400 - elif time_interval == "weekly": - interval_size = 86400 * 7 - elif time_interval == "monthly": - interval_size = 86400 * 30 - elif time_interval == "yearly": - interval_size = 86400* 365 - if int(total_time / interval_size) > max_intervals: - user_input = input("Too many timestamps, discretizing data to 200 timestamps, do you want to proceed?(y/n): ") - if user_input.lower() == 'n': - print('Cannot proceed to TEA and TET plot') - exit() - else: - interval_size = max_intervals - elif isinstance(time_interval, int): - if time_interval > max_intervals: - raise ValueError(f"The maximum number of time intervals is {max_intervals}.") - else: - interval_size = int(total_time / (time_interval)) - - else: - raise TypeError("Invalid time interval") - else: - user_input = input(f"discretizing data to {max_intervals} timestamps, do you want to proceed?(y/n): ") - if user_input.lower() == 'n': - print('Cannot proceed to TEA and TET plot') - exit() - else: - interval_size = int(total_time / 100) - num_intervals = int(total_time/interval_size) - print(f'Discretizing data to {num_intervals} timestamps...') - if num_intervals == 0: - print("Warning! Only one timestamp exist in the data.") - updated_edgelist = {} - new_ts = {} - curr_t = 0 - for ts, edge_data in edgelist.items(): - bin_ts = int(ts / interval_size) - if bin_ts >= num_intervals: - bin_ts -= 1 - - if bin_ts not in new_ts: - new_ts[bin_ts] = curr_t - curr_t += 1 - - if new_ts[bin_ts] not in updated_edgelist: - updated_edgelist[new_ts[bin_ts]] = {} - - for (u,v), n in edge_data.items(): - if (u, v) not in updated_edgelist[new_ts[bin_ts]]: - updated_edgelist[new_ts[bin_ts]][(u, v)] = n - else: - updated_edgelist[new_ts[bin_ts]][(u, v)] += n + +def edgelist_discritizer(edgelist, + unique_ts, + time_interval = None, + max_intervals = 200): + + total_time = unique_ts[-1] - unique_ts[0] + if time_interval is not None: + if isinstance(time_interval, str): + if time_interval == "daily": + interval_size = 86400 + elif time_interval == "weekly": + interval_size = 86400 * 7 + elif time_interval == "monthly": + interval_size = 86400 * 30 + elif time_interval == "yearly": + interval_size = 86400* 365 + if int(total_time / interval_size) > max_intervals: + user_input = input("Too many timestamps, discretizing data to 200 timestamps, do you want to proceed?(y/n): ") + if user_input.lower() == 'n': + print('Cannot proceed to TEA and TET plot') + exit() + else: + interval_size = max_intervals + elif isinstance(time_interval, int): + if time_interval > max_intervals: + raise ValueError(f"The maximum number of time intervals is {max_intervals}.") + else: + interval_size = int(total_time / (time_interval)) + + else: + raise TypeError("Invalid time interval") + else: + user_input = input(f"discretizing data to {max_intervals} timestamps, do you want to proceed?(y/n): ") + if user_input.lower() == 'n': + print('Cannot proceed to TEA and TET plot') + exit() + else: + interval_size = int(total_time / 100) + num_intervals = 
int(total_time/interval_size) + print(f'Discretizing data to {num_intervals} timestamps...') + if num_intervals == 0: + print("Warning! Only one timestamp exist in the data.") + updated_edgelist = {} + new_ts = {} + curr_t = 0 + for ts, edge_data in edgelist.items(): + bin_ts = int(ts / interval_size) + if bin_ts >= num_intervals: + bin_ts -= 1 + + if bin_ts not in new_ts: + new_ts[bin_ts] = curr_t + curr_t += 1 + + if new_ts[bin_ts] not in updated_edgelist: + updated_edgelist[new_ts[bin_ts]] = {} + + for (u,v), n in edge_data.items(): + if (u, v) not in updated_edgelist[new_ts[bin_ts]]: + updated_edgelist[new_ts[bin_ts]][(u, v)] = n + else: + updated_edgelist[new_ts[bin_ts]][(u, v)] += n return updated_edgelist \ No newline at end of file diff --git a/tgx/utils/.ipynb_checkpoints/graph_stat-checkpoint.py b/tgx/utils/.ipynb_checkpoints/graph_stat-checkpoint.py index ee6b41f..29e3e3d 100644 --- a/tgx/utils/.ipynb_checkpoints/graph_stat-checkpoint.py +++ b/tgx/utils/.ipynb_checkpoints/graph_stat-checkpoint.py @@ -1,82 +1,82 @@ -from tgx.utils.plotting_utils import plot_for_snapshots, plot_nodes_edges_per_ts - -__all__ = ["average_degree_per_ts", - "nodes_per_ts", - "edges_per_ts", - "nodes_and_edges_per_ts"] - - -def average_degree_per_ts(graph: list, - total_nodes: int, - network_name: str, - plot_path: str = None) -> None: - ''' - input: a list containing graph snapshots - ''' - print("Plotting average degree per timestamp") - ave_degree = _calculate_average_degree_per_ts(graph, total_nodes) - filename = f"{network_name}_ave_degree_per_ts" - plot_for_snapshots(ave_degree, filename, "Average degree", plot_path = plot_path) - print("Plotting Done!") - return - - -def nodes_per_ts(graph: list, - network_name: str, - plot_path: str = None) -> None: - ''' - input: a list containing graph snapshots - ''' - print("Plotting number of nodes per timestamp") - active_nodes = _calculate_node_per_ts(graph) - filename = f"{network_name}_nodes_per_ts" - plot_for_snapshots(active_nodes, filename, "Number of nodes", plot_path = plot_path) - print("Plotting Done!") - return - -def edges_per_ts(graph: list, - plot_path: str, - network_name: str) -> None: - ''' - input: a list containing graph snapshots - ''' - print("Plotting number of edges per timestamp") - active_edges = _calculate_edge_per_ts(graph) - filename = f"{network_name}_edges_per_ts" - plot_for_snapshots(active_edges, plot_path, filename, "Number of edges") - print("Plotting Done!") - return - -def nodes_and_edges_per_ts(graph: list, - network_name: str, - plot_path: str = None): - - edges = _calculate_edge_per_ts(graph) - nodes = _calculate_node_per_ts(graph) - ts = list(range(0, len(graph))) - - return plot_nodes_edges_per_ts(edges, nodes, ts, network_name, plot_path = plot_path) - - -def _calculate_average_degree_per_ts(graph, total_nodes): - total_ts = len(graph) - ave_degree = [] - for t1 in range(total_ts): - num_edges = graph[t1].number_of_edges() - ave_degree.append(num_edges*2/ total_nodes) - return ave_degree - - -def _calculate_node_per_ts(graph): - active_nodes = [] - for ts in range(len(graph)): - active_nodes.append(graph[ts].number_of_nodes()) - return active_nodes - -def _calculate_edge_per_ts(graph): - active_edges = [] - for ts in range(len(graph)): - active_edges.append(graph[ts].number_of_edges()) - return active_edges - - +from tgx.utils.plotting_utils import plot_for_snapshots, plot_nodes_edges_per_ts + +__all__ = ["average_degree_per_ts", + "nodes_per_ts", + "edges_per_ts", + "nodes_and_edges_per_ts"] + + +def 
average_degree_per_ts(graph: list, + total_nodes: int, + network_name: str, + plot_path: str = None) -> None: + ''' + input: a list containing graph snapshots + ''' + print("Plotting average degree per timestamp") + ave_degree = _calculate_average_degree_per_ts(graph, total_nodes) + filename = f"{network_name}_ave_degree_per_ts" + plot_for_snapshots(ave_degree, filename, "Average degree", plot_path = plot_path) + print("Plotting Done!") + return + + +def nodes_per_ts(graph: list, + network_name: str, + plot_path: str = None) -> None: + ''' + input: a list containing graph snapshots + ''' + print("Plotting number of nodes per timestamp") + active_nodes = _calculate_node_per_ts(graph) + filename = f"{network_name}_nodes_per_ts" + plot_for_snapshots(active_nodes, filename, "Number of nodes", plot_path = plot_path) + print("Plotting Done!") + return + +def edges_per_ts(graph: list, + plot_path: str, + network_name: str) -> None: + ''' + input: a list containing graph snapshots + ''' + print("Plotting number of edges per timestamp") + active_edges = _calculate_edge_per_ts(graph) + filename = f"{network_name}_edges_per_ts" + plot_for_snapshots(active_edges, plot_path, filename, "Number of edges") + print("Plotting Done!") + return + +def nodes_and_edges_per_ts(graph: list, + network_name: str, + plot_path: str = None): + + edges = _calculate_edge_per_ts(graph) + nodes = _calculate_node_per_ts(graph) + ts = list(range(0, len(graph))) + + return plot_nodes_edges_per_ts(edges, nodes, ts, network_name, plot_path = plot_path) + + +def _calculate_average_degree_per_ts(graph, total_nodes): + total_ts = len(graph) + ave_degree = [] + for t1 in range(total_ts): + num_edges = graph[t1].number_of_edges() + ave_degree.append(num_edges*2/ total_nodes) + return ave_degree + + +def _calculate_node_per_ts(graph): + active_nodes = [] + for ts in range(len(graph)): + active_nodes.append(graph[ts].number_of_nodes()) + return active_nodes + +def _calculate_edge_per_ts(graph): + active_edges = [] + for ts in range(len(graph)): + active_edges.append(graph[ts].number_of_edges()) + return active_edges + + diff --git a/tgx/utils/.ipynb_checkpoints/plotting_utils-checkpoint.py b/tgx/utils/.ipynb_checkpoints/plotting_utils-checkpoint.py index 3fd80ad..efa26fa 100644 --- a/tgx/utils/.ipynb_checkpoints/plotting_utils-checkpoint.py +++ b/tgx/utils/.ipynb_checkpoints/plotting_utils-checkpoint.py @@ -1,101 +1,101 @@ -import datetime -import pandas as pd -import matplotlib.pyplot as plt -import numpy as np - -def create_ts_list(start, end, metric=None, interval=None): - if metric == "Unix" or metric == "unix" or metric == "UNIX": - start = datetime.datetime.fromtimestamp(start).date() - end = datetime.datetime.fromtimestamp(end).date() - if interval == 'daily': - date_list = pd.date_range(start = start, end = end, freq="D") - elif interval == "month": - date_list = pd.date_range(start = start, end = end, freq="M") - elif interval == "year": - date_list = pd.date_range(start = start, end = end, freq="Y") - timelist = [] - for dates in date_list: - timelist.append(dates.strftime("%Y/%m/%d")) - else: - timelist = list(range(start, end, interval)) - # print(timelist) - return timelist - - - -def plot_nodes_edges_per_ts(edges: list, - nodes: list, - ts: list, - network_name: str, - plot_path: str = None, - ylabel_1: str = 'Edges per Timestamp', - ylabel_2: str = 'Nodes per Timestamp'): - """ - Plot nodes and edges per timestamp in one figure - Parameters: - edges: A list containing number of edges per timestamp - nodes: A list 
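As a quick check of `_calculate_average_degree_per_ts` above: each edge contributes two endpoint slots, so the per-snapshot average degree is 2*E_t/N. (Note also that `edges_per_ts` above passes `plot_path` positionally where `plot_for_snapshots` expects `filename`, so its call appears mis-ordered.) A toy example:

snapshot_edge_counts = [5, 8, 3]   # edges in each snapshot
total_nodes = 10
ave_degree = [2 * e / total_nodes for e in snapshot_edge_counts]
print(ave_degree)                  # [1.0, 1.6, 0.6]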
containing number of nodes per timestamp - ts: list of timestamps - network_name: Name of the network to be used in the output file name - plot_path: Path to save the output figure - ylabel_1: Label for the edges per timestamp line - ylabel_2: Label for the nodes per timestamp line - """ - fig = plt.figure(facecolor='w', figsize=(11, 6)) - ax1 = fig.add_subplot(111) - ax2 = ax1.twinx() - - c1, = ax1.plot(ts, edges, color='black', lw=3, label=ylabel_1) - c2, = ax2.plot(ts, nodes, color='gray', linestyle='dashed', lw=3, label=ylabel_2) - curves = [c1, c2] - ax1.legend(curves, [curve.get_label() for curve in curves], fontsize = 18) - ax1.set_xlabel('Time', fontsize=20) - ax1.set_ylabel(ylabel_1, fontsize=20) - ax2.set_ylabel(ylabel_2, fontsize=20) - ax1.tick_params(labelsize=20) - ax2.tick_params(labelsize=20) - ax1.set_ylim(0) - ax2.set_ylim(0) - ax1.set_xlim(0, len(ts)-1) - if plot_path is not None: - filename = f"{network_name}_node&edge_per_ts" - plt.savefig(f'{plot_path}/{filename}') - plt.show() - -def plot_for_snapshots(data: list, - filename: str, - y_title: str, - show_ave: bool=True, - plot_path:str = None, - plot_title:str = None): - ''' - Plot a variable for different timestamps - Parameters: - data: A list of desired variable to be plotted - filename: Name of the output file name - y_title: Title of the y axis - show_ave: Whether to plot a line showing the average of the variable over all timestamps - plot_path: The path to save the output file - ''' - ts = list(range(0, len(data))) - # plt.rcParams["font.family"] = "Times New Roman" - fig = plt.figure(facecolor='w', figsize=(9,6)) - ax = fig.add_subplot(111) - ax.plot(ts, data, color='black', lw=3) - - ax.set_xlabel('Time', fontsize=20) - ax.set_ylabel(y_title, fontsize=20) - ax.tick_params(labelsize=20) - # ax.set_ylim(0, 7.5) - ax.set_xlim(0, len(ts)-1) - ax.set_title(plot_title, fontsize=20) - if show_ave: - ave_deg = [np.average(data) for i in range(len(ts))] - ax.plot(ts, ave_deg, color='#ca0020', linestyle='dashed', lw=3) - if plot_path is not None: - plt.savefig(f'{plot_path}/{filename}') - plt.show() - -if __name__ == "__main__": - create_ts_list(86400, 86400*365, "unix", "month") +import datetime +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np + +def create_ts_list(start, end, metric=None, interval=None): + if metric == "Unix" or metric == "unix" or metric == "UNIX": + start = datetime.datetime.fromtimestamp(start).date() + end = datetime.datetime.fromtimestamp(end).date() + if interval == 'daily': + date_list = pd.date_range(start = start, end = end, freq="D") + elif interval == "month": + date_list = pd.date_range(start = start, end = end, freq="M") + elif interval == "year": + date_list = pd.date_range(start = start, end = end, freq="Y") + timelist = [] + for dates in date_list: + timelist.append(dates.strftime("%Y/%m/%d")) + else: + timelist = list(range(start, end, interval)) + # print(timelist) + return timelist + + + +def plot_nodes_edges_per_ts(edges: list, + nodes: list, + ts: list, + network_name: str, + plot_path: str = None, + ylabel_1: str = 'Edges per Timestamp', + ylabel_2: str = 'Nodes per Timestamp'): + """ + Plot nodes and edges per timestamp in one figure + Parameters: + edges: A list containing number of edges per timestamp + nodes: A list containing number of nodes per timestamp + ts: list of timestamps + network_name: Name of the network to be used in the output file name + plot_path: Path to save the output figure + ylabel_1: Label for the edges per timestamp line + 
ylabel_2: Label for the nodes per timestamp line + """ + fig = plt.figure(facecolor='w', figsize=(11, 6)) + ax1 = fig.add_subplot(111) + ax2 = ax1.twinx() + + c1, = ax1.plot(ts, edges, color='black', lw=3, label=ylabel_1) + c2, = ax2.plot(ts, nodes, color='gray', linestyle='dashed', lw=3, label=ylabel_2) + curves = [c1, c2] + ax1.legend(curves, [curve.get_label() for curve in curves], fontsize = 18) + ax1.set_xlabel('Time', fontsize=20) + ax1.set_ylabel(ylabel_1, fontsize=20) + ax2.set_ylabel(ylabel_2, fontsize=20) + ax1.tick_params(labelsize=20) + ax2.tick_params(labelsize=20) + ax1.set_ylim(0) + ax2.set_ylim(0) + ax1.set_xlim(0, len(ts)-1) + if plot_path is not None: + filename = f"{network_name}_node&edge_per_ts" + plt.savefig(f'{plot_path}/{filename}') + plt.show() + +def plot_for_snapshots(data: list, + filename: str, + y_title: str, + show_ave: bool=True, + plot_path:str = None, + plot_title:str = None): + ''' + Plot a variable for different timestamps + Parameters: + data: A list of desired variable to be plotted + filename: Name of the output file name + y_title: Title of the y axis + show_ave: Whether to plot a line showing the average of the variable over all timestamps + plot_path: The path to save the output file + ''' + ts = list(range(0, len(data))) + # plt.rcParams["font.family"] = "Times New Roman" + fig = plt.figure(facecolor='w', figsize=(9,6)) + ax = fig.add_subplot(111) + ax.plot(ts, data, color='black', lw=3) + + ax.set_xlabel('Time', fontsize=20) + ax.set_ylabel(y_title, fontsize=20) + ax.tick_params(labelsize=20) + # ax.set_ylim(0, 7.5) + ax.set_xlim(0, len(ts)-1) + ax.set_title(plot_title, fontsize=20) + if show_ave: + ave_deg = [np.average(data) for i in range(len(ts))] + ax.plot(ts, ave_deg, color='#ca0020', linestyle='dashed', lw=3) + if plot_path is not None: + plt.savefig(f'{plot_path}/{filename}') + plt.show() + +if __name__ == "__main__": + create_ts_list(86400, 86400*365, "unix", "month") create_ts_list(2015, 2022, interval=2) \ No newline at end of file diff --git a/tgx/utils/__init__.py b/tgx/utils/__init__.py index 3c9b13a..bf94939 100644 --- a/tgx/utils/__init__.py +++ b/tgx/utils/__init__.py @@ -1,5 +1,5 @@ -# from tgx.utils.graph_stat import * -# from tgx.utils.graph_utils import * -# from tgx.utils.plotting_utils import * - +# from tgx.utils.graph_stat import * +# from tgx.utils.graph_utils import * +# from tgx.utils.plotting_utils import * + # from . import * \ No newline at end of file diff --git a/tgx/utils/graph_utils.py b/tgx/utils/graph_utils.py index 99bd4d0..8267e5e 100644 --- a/tgx/utils/graph_utils.py +++ b/tgx/utils/graph_utils.py @@ -1,282 +1,282 @@ -import numpy as np -from typing import Union, Optional - -__all__ = ["train_test_split", - "discretize_edges", - "subsampling", - "node_list", - "is_discretized", - "frequency_count"] - -SEC_IN_MIN = 60 -SEC_IN_HOUR = 3600 -SEC_IN_DAY = 86400 -SEC_IN_WEEK = 86400 * 7 -SEC_IN_MONTH = 86400 * 30 -SEC_IN_YEAR = 86400 * 365 - -# helper function to do ceiling divison, i.e. 5/2 = 3 -def ceiling_division(n, d): - q, r = divmod(n, d) - return q + bool(r) - - - -def discretize_edges(edgelist: dict, - time_scale: Union[int,str], - store_unix: Optional[bool] = False) -> list: - """ - util function for discretizing edgelist, expected timestamp on edges are unixtimestamp - this func supports discretization of edge timestamp - 1. by providing the number of intervals (int), it will equally divide the data into that number of intervals. Note that the last bin can have less duration than others. - 2. 
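`ceiling_division` above is the pivot of the new discretizer: it rounds a quotient up whenever there is a remainder, so a partial trailing interval still gets its own bin. A tiny self-check:

def ceiling_division(n, d):
    q, r = divmod(n, d)
    return q + bool(r)

assert ceiling_division(5, 2) == 3   # rounds up on a remainder
assert ceiling_division(6, 2) == 3   # exact division is untouched
assert ceiling_division(0, 7) == 0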
by providing a time granularity (str), it will divide the data into intervals based on the given granularity, i.e. "hourly", "daily", "weekly", "monthly", "yearly", the starting time of the dataset is consider the start of the first interval - Parameters: - edgelist: dict, dictionary of edges - time_scale: int or str, time interval to discretize the graph - store_unix: bool, whether to return the converted timestamps in unix format - Returns: - output list: the first item in the list is always the updated edgelist (dict, dictionary of edges with discretized timestamps) and the second item is the converted timestamps in unix format (list) if store_unix is True - """ - unique_ts = list(edgelist.keys()) - total_time = unique_ts[-1] - unique_ts[0] - if time_scale is not None: - if isinstance(time_scale, int): - interval_size = total_time // time_scale #integer timestamp of the bin, discounting any bin that has a smaller duration than others - elif isinstance(time_scale, str): - if time_scale == "minutely": - interval_size = SEC_IN_MIN - elif time_scale == "hourly": - interval_size = SEC_IN_HOUR - elif time_scale == "daily": - interval_size = SEC_IN_DAY - elif time_scale == "weekly": - interval_size = SEC_IN_WEEK - elif time_scale == "monthly": - interval_size = SEC_IN_MONTH - elif time_scale == "yearly": - interval_size = SEC_IN_YEAR - else: - raise TypeError("Invalid time interval") - else: - raise TypeError("Please provide a time interval") - - num_time_scale = ceiling_division(total_time, interval_size) - print(f'Discretizing data to {num_time_scale} timestamps...') - - updated_edgelist = {} - - if (store_unix): - unix_dict = [] - start_time = int(unique_ts[0]) - for ts, edges_list in edgelist.items(): - bin_ts = ceiling_division(ts, interval_size) #will correctly put edges into the last bin - - for edge in edges_list: - if bin_ts not in updated_edgelist: - updated_edgelist[bin_ts] = [edge] - else: - updated_edgelist[bin_ts].append(edge) - - if (store_unix): - unix_ts = start_time + int(ts // interval_size) * interval_size #round to the nearest start time - unix_ts = int(unix_ts) - unix_dict.extend([unix_ts] * len(edges_list)) - - output = [updated_edgelist] - if (store_unix): - output.append(unix_dict) - return output - - -# def edgelist_discritizer(edgelist: dict, -# time_scale: Union[str, int]): -# """ -# util function for discretizing edgelist, expected timestamp on edges are unixtimestamp -# this func supports discretization in two different ways -# 1. by providing the number of intervals (int), it will equally divide the data into that number of intervals. Note that the last bin can have less duration than others. -# 2. by providing a time granularity (str), it will divide the data into intervals based on the given granularity, i.e. "hourly", "daily", "weekly", "monthly", "yearly" -# In the second way however, the intervals will be based on utc timezone (dividing into days, hours this way) thus both first bin and last bin can have last duration than others. 
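A usage sketch of `discretize_edges` as defined above, on a toy unix-timestamped edgelist (the `{ts: [(u, v), ...]}` shape is assumed from the loaders in tgx/io/read.py):

from tgx.utils.graph_utils import discretize_edges

edgelist = {0: [(1, 2)], 86400: [(2, 3)], 172800: [(1, 3), (3, 4)]}
binned, unix_ts = discretize_edges(edgelist, time_scale="daily", store_unix=True)
print(sorted(binned))   # [0, 1, 2] -- one bin per day, via ceiling_division
print(unix_ts)          # [0, 86400, 172800, 172800] -- per-edge bin start times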
- -# Parameters: -# edgelist: dict, dictionary of edges -# time_scale: str or int, time interval to discretize the graph -# Returns: -# updated_edgelist: dict, dictionary of edges with discretized timestamps -# """ - -# unique_ts = list(edgelist.keys()) - -# total_time = unique_ts[-1] - unique_ts[0] -# if time_scale is not None: -# if isinstance(time_scale, str): -# if time_scale == "hourly": -# interval_size = SEC_IN_HOUR -# elif time_scale == "daily": -# interval_size = SEC_IN_DAY -# elif time_scale == "weekly": -# interval_size = SEC_IN_WEEK -# elif time_scale == "monthly": -# interval_size = SEC_IN_MONTH -# elif time_scale == "yearly": -# interval_size = SEC_IN_YEAR -# elif isinstance(time_scale, int): -# interval_size = int(total_time / (time_scale)) -# else: -# raise TypeError("Invalid time interval") -# else: -# raise TypeError("Please provide a time interval") -# num_time_scale = int(total_time/interval_size) -# print(f'Discretizing data to {num_time_scale} timestamps...') -# # if num_time_scale == 0: -# # print("Warning! Only one timestamp exist in the data.") - -# updated_edgelist = {} -# for ts, edges_list in edgelist.items(): -# bin_ts = int(ts / interval_size) -# if bin_ts >= num_time_scale: -# bin_ts -= 1 - -# for edge in edges_list: -# if bin_ts not in updated_edgelist: -# updated_edgelist[bin_ts] = [] -# updated_edgelist[bin_ts].append(edge) -# print("Discretization Done..!") -# return updated_edgelist - - - - - - - -def subsampling(graph: Union[object, dict], - node_list: Optional[list] = [], - random_selection: Optional[bool] = False, - N: Optional[int] = 100 - ) -> dict: - """ - Subsampling a part of graph by only monitoring the contacts from specific nodes' list - - Parameters: - graph: graph object or edgelist dict - node_list: list, a set of nodes to extract their contacts from the graph - random_selection: bool, wether randomly subsample a set of nodes from graph - N: int, number of nodes to be randomly sampled from graph - - Returns: - new_edgelist: dict, a dictionary of edges corresponding to nodes in the node_list - """ - print("Generate graph subsample...") - if isinstance(graph, dict): - edgelist = graph - nodes = node_list(graph) - else: - edgelist = graph.edgelist - nodes = graph.nodes() - - if random_selection: - node_list = list(np.random.choice(nodes, size = N, replace = False)) - - new_edgelist = {} - for t, edge_data in edgelist.items(): - for (u,v), f in edge_data.items(): - if u in node_list or v in node_list: - if t not in new_edgelist: - new_edgelist[t] = {} - new_edgelist[t][(u, v)] = f - else: - new_edgelist[t][(u, v)] = f - return new_edgelist - -def frequency_count(edgelist: dict): - new_edgelist = {} - - for t, edges_list in edgelist.items(): - for edge in edges_list: - (u, v) = edge - - # Check if this is the first edge occurning in this timestamp - if t not in new_edgelist: - new_edgelist[t] = {} - new_edgelist[t][(u, v)] = 1 - - else: - if (u, v) not in new_edgelist[t]: - new_edgelist[t][(u, v)] = 1 # If the edge was not occured in this timestamp before - else: - new_edgelist[t][(u, v)] += 1 - - return new_edgelist - -def node_list(dict_edgelist: dict) -> list: - - """ - create a list of nodes from edgelist dictionary - """ - node_list = {} - for _, edge_data in dict_edgelist.items(): - for (u,v), _ in edge_data.items(): - if u not in node_list: - node_list[u] = 1 - if v not in node_list: - node_list[v] = 1 - return list(node_list.keys()) - - -def train_test_split(data : dict, - val : bool = False, - ratio : list = [85, 15]) -> dict: - """ - 
Generate train/test split for the data - - Parameters: - data:dictionary of data - val: whether we want to have a validation split as well - ratio: list indication the ratio of the data in split. Sum of the list components should be 100. - - Returns: - two (train/test) or three (train/val/test) data dictionaries - """ - sum = 0 - for i in ratio: - sum += i - if sum != 100: - raise ValueError("invalid train/test split ratio. Sum of the ratios should be 100.") - - if val and len(ratio) != 3: - raise Exception("Provide train/val/test ratio") - elif not val and len(ratio) == 3: - print("Warning! Data is being splitted to train and test only!") - - data_len = len(data) - train_split = int(data_len * ratio[0] / 100) - train_data = {k: v for k, v in data.items() if k < train_split} - if val: - val_split = int(data_len * ratio[1] / 100) + train_split - val_data = {k: v for k, v in data.items() if train_split <= k < val_split} - test_data = {k: v for k, v in data.items() if val_split <= k <= data_len} - return train_data, val_data, test_data - - else: - test_data = {k: v for k, v in data.items() if train_split <= k <= data_len} - return train_data, test_data - - -def is_discretized(edgelist: Optional[dict], - max_timestamps: Optional[int] = 10000) -> bool: - r""" - Check if an edgelist is discretized or not. - """ - timestamps = list(edgelist.keys()) - discretized = True - if len(timestamps) > max_timestamps: - discretized = False - - return discretized - -def list2csv(lst: list, - fname: str, - delimiter: str = ",", - fmt: str = '%i'): - out_list = np.array(lst) +import numpy as np +from typing import Union, Optional + +__all__ = ["train_test_split", + "discretize_edges", + "subsampling", + "node_list", + "is_discretized", + "frequency_count"] + +SEC_IN_MIN = 60 +SEC_IN_HOUR = 3600 +SEC_IN_DAY = 86400 +SEC_IN_WEEK = 86400 * 7 +SEC_IN_MONTH = 86400 * 30 +SEC_IN_YEAR = 86400 * 365 + +# helper function to do ceiling divison, i.e. 5/2 = 3 +def ceiling_division(n, d): + q, r = divmod(n, d) + return q + bool(r) + + + +def discretize_edges(edgelist: dict, + time_scale: Union[int,str], + store_unix: Optional[bool] = False) -> list: + """ + util function for discretizing edgelist, expected timestamp on edges are unixtimestamp + this func supports discretization of edge timestamp + 1. by providing the number of intervals (int), it will equally divide the data into that number of intervals. Note that the last bin can have less duration than others. + 2. by providing a time granularity (str), it will divide the data into intervals based on the given granularity, i.e. 
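Note that `train_test_split` above splits by timestamp key, not by edge count. A sketch of the three-way form, assuming a data dict keyed 0..99 by discretized timestamp:

data = {t: [("a", "b")] for t in range(100)}
train, val, test = train_test_split(data, val=True, ratio=[70, 15, 15])
print(len(train), len(val), len(test))   # 70 15 15 -- timestamps [0,70), [70,85), [85,100)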
"hourly", "daily", "weekly", "monthly", "yearly", the starting time of the dataset is consider the start of the first interval + Parameters: + edgelist: dict, dictionary of edges + time_scale: int or str, time interval to discretize the graph + store_unix: bool, whether to return the converted timestamps in unix format + Returns: + output list: the first item in the list is always the updated edgelist (dict, dictionary of edges with discretized timestamps) and the second item is the converted timestamps in unix format (list) if store_unix is True + """ + unique_ts = list(edgelist.keys()) + total_time = unique_ts[-1] - unique_ts[0] + if time_scale is not None: + if isinstance(time_scale, int): + interval_size = total_time // time_scale #integer timestamp of the bin, discounting any bin that has a smaller duration than others + elif isinstance(time_scale, str): + if time_scale == "minutely": + interval_size = SEC_IN_MIN + elif time_scale == "hourly": + interval_size = SEC_IN_HOUR + elif time_scale == "daily": + interval_size = SEC_IN_DAY + elif time_scale == "weekly": + interval_size = SEC_IN_WEEK + elif time_scale == "monthly": + interval_size = SEC_IN_MONTH + elif time_scale == "yearly": + interval_size = SEC_IN_YEAR + else: + raise TypeError("Invalid time interval") + else: + raise TypeError("Please provide a time interval") + + num_time_scale = ceiling_division(total_time, interval_size) + print(f'Discretizing data to {num_time_scale} timestamps...') + + updated_edgelist = {} + + if (store_unix): + unix_dict = [] + start_time = int(unique_ts[0]) + for ts, edges_list in edgelist.items(): + bin_ts = ceiling_division(ts, interval_size) #will correctly put edges into the last bin + + for edge in edges_list: + if bin_ts not in updated_edgelist: + updated_edgelist[bin_ts] = [edge] + else: + updated_edgelist[bin_ts].append(edge) + + if (store_unix): + unix_ts = start_time + int(ts // interval_size) * interval_size #round to the nearest start time + unix_ts = int(unix_ts) + unix_dict.extend([unix_ts] * len(edges_list)) + + output = [updated_edgelist] + if (store_unix): + output.append(unix_dict) + return output + + +# def edgelist_discritizer(edgelist: dict, +# time_scale: Union[str, int]): +# """ +# util function for discretizing edgelist, expected timestamp on edges are unixtimestamp +# this func supports discretization in two different ways +# 1. by providing the number of intervals (int), it will equally divide the data into that number of intervals. Note that the last bin can have less duration than others. +# 2. by providing a time granularity (str), it will divide the data into intervals based on the given granularity, i.e. "hourly", "daily", "weekly", "monthly", "yearly" +# In the second way however, the intervals will be based on utc timezone (dividing into days, hours this way) thus both first bin and last bin can have last duration than others. 
+ +# Parameters: +# edgelist: dict, dictionary of edges +# time_scale: str or int, time interval to discretize the graph +# Returns: +# updated_edgelist: dict, dictionary of edges with discretized timestamps +# """ + +# unique_ts = list(edgelist.keys()) + +# total_time = unique_ts[-1] - unique_ts[0] +# if time_scale is not None: +# if isinstance(time_scale, str): +# if time_scale == "hourly": +# interval_size = SEC_IN_HOUR +# elif time_scale == "daily": +# interval_size = SEC_IN_DAY +# elif time_scale == "weekly": +# interval_size = SEC_IN_WEEK +# elif time_scale == "monthly": +# interval_size = SEC_IN_MONTH +# elif time_scale == "yearly": +# interval_size = SEC_IN_YEAR +# elif isinstance(time_scale, int): +# interval_size = int(total_time / (time_scale)) +# else: +# raise TypeError("Invalid time interval") +# else: +# raise TypeError("Please provide a time interval") +# num_time_scale = int(total_time/interval_size) +# print(f'Discretizing data to {num_time_scale} timestamps...') +# # if num_time_scale == 0: +# # print("Warning! Only one timestamp exist in the data.") + +# updated_edgelist = {} +# for ts, edges_list in edgelist.items(): +# bin_ts = int(ts / interval_size) +# if bin_ts >= num_time_scale: +# bin_ts -= 1 + +# for edge in edges_list: +# if bin_ts not in updated_edgelist: +# updated_edgelist[bin_ts] = [] +# updated_edgelist[bin_ts].append(edge) +# print("Discretization Done..!") +# return updated_edgelist + + + + + + + +def subsampling(graph: Union[object, dict], + node_list: Optional[list] = [], + random_selection: Optional[bool] = False, + N: Optional[int] = 100 + ) -> dict: + """ + Subsampling a part of graph by only monitoring the contacts from specific nodes' list + + Parameters: + graph: graph object or edgelist dict + node_list: list, a set of nodes to extract their contacts from the graph + random_selection: bool, wether randomly subsample a set of nodes from graph + N: int, number of nodes to be randomly sampled from graph + + Returns: + new_edgelist: dict, a dictionary of edges corresponding to nodes in the node_list + """ + print("Generate graph subsample...") + if isinstance(graph, dict): + edgelist = graph + nodes = node_list(graph) + else: + edgelist = graph.edgelist + nodes = graph.nodes() + + if random_selection: + node_list = list(np.random.choice(nodes, size = N, replace = False)) + + new_edgelist = {} + for t, edge_data in edgelist.items(): + for (u,v), f in edge_data.items(): + if u in node_list or v in node_list: + if t not in new_edgelist: + new_edgelist[t] = {} + new_edgelist[t][(u, v)] = f + else: + new_edgelist[t][(u, v)] = f + return new_edgelist + +def frequency_count(edgelist: dict): + new_edgelist = {} + + for t, edges_list in edgelist.items(): + for edge in edges_list: + (u, v) = edge + + # Check if this is the first edge occurning in this timestamp + if t not in new_edgelist: + new_edgelist[t] = {} + new_edgelist[t][(u, v)] = 1 + + else: + if (u, v) not in new_edgelist[t]: + new_edgelist[t][(u, v)] = 1 # If the edge was not occured in this timestamp before + else: + new_edgelist[t][(u, v)] += 1 + + return new_edgelist + +def node_list(dict_edgelist: dict) -> list: + + """ + create a list of nodes from edgelist dictionary + """ + node_list = {} + for _, edge_data in dict_edgelist.items(): + for (u,v), _ in edge_data.items(): + if u not in node_list: + node_list[u] = 1 + if v not in node_list: + node_list[v] = 1 + return list(node_list.keys()) + + +def train_test_split(data : dict, + val : bool = False, + ratio : list = [85, 15]) -> dict: + """ + 
Generate train/test split for the data + + Parameters: + data:dictionary of data + val: whether we want to have a validation split as well + ratio: list indication the ratio of the data in split. Sum of the list components should be 100. + + Returns: + two (train/test) or three (train/val/test) data dictionaries + """ + sum = 0 + for i in ratio: + sum += i + if sum != 100: + raise ValueError("invalid train/test split ratio. Sum of the ratios should be 100.") + + if val and len(ratio) != 3: + raise Exception("Provide train/val/test ratio") + elif not val and len(ratio) == 3: + print("Warning! Data is being splitted to train and test only!") + + data_len = len(data) + train_split = int(data_len * ratio[0] / 100) + train_data = {k: v for k, v in data.items() if k < train_split} + if val: + val_split = int(data_len * ratio[1] / 100) + train_split + val_data = {k: v for k, v in data.items() if train_split <= k < val_split} + test_data = {k: v for k, v in data.items() if val_split <= k <= data_len} + return train_data, val_data, test_data + + else: + test_data = {k: v for k, v in data.items() if train_split <= k <= data_len} + return train_data, test_data + + +def is_discretized(edgelist: Optional[dict], + max_timestamps: Optional[int] = 10000) -> bool: + r""" + Check if an edgelist is discretized or not. + """ + timestamps = list(edgelist.keys()) + discretized = True + if len(timestamps) > max_timestamps: + discretized = False + + return discretized + +def list2csv(lst: list, + fname: str, + delimiter: str = ",", + fmt: str = '%i'): + out_list = np.array(lst) np.savetxt(fname, out_list, delimiter=delimiter, fmt=fmt) \ No newline at end of file diff --git a/tgx/utils/newstat.py b/tgx/utils/newstat.py index 4ce58fd..1df8cae 100644 --- a/tgx/utils/newstat.py +++ b/tgx/utils/newstat.py @@ -1,163 +1,163 @@ -from tgx.utils.plotting_utils import plot_for_snapshots, plot_nodes_edges_per_ts, plot_density_map -import networkx as nx -import numpy as np -from tgx.utils.graph_utils import train_test_split -from typing import List, Dict - -__all__ = ["connected_components_per_ts", - "size_connected_components", - "get_avg_node_engagement", - "degree_density"] - - -def degree_density(graph: tuple, k: int = 10, network_name: str = None, plot_path: str = None) -> None: - r""" - Plot density map of node degrees per time window - Parameters: - graph_edgelist: Dictionary containing graph data - k: number of time windows - network_name: name of the graph to be used in the output file name - plot_path: path to save the output figure - """ - graph_edgelist = graph.data - degrees_by_k_list = [] - temp = [] - temp_idx = 0 - unique_ts = list(graph_edgelist.keys()) - - for ts in unique_ts: - e_at_this_ts = graph_edgelist[ts] - G = nx.MultiGraph() - - for e in e_at_this_ts: - G.add_edge(e[0], e[1]) - - nodes = G.nodes() - degrees = [G.degree[n] for n in nodes] - - if temp_idx None: - r""" - Plot number of connected components per timestamp - Parameters: - graph: a list containing graph snapshots - network_name: name of the graph to be used in the output file name - plot_path: path to save the output figure - """ - num_components = [] - for t in range(len(graph.data)): - edgelist_t = graph.data[t] - nodes_t = graph.edgelist_node_list(edgelist_t) - parent = {node: node for node in nodes_t} - - for edge in edgelist_t: - (u, v) = edge - _merge(u, v, parent) - - num = 0 - for u in nodes_t: - if parent[u] == u: - num += 1 - num_components.append(num) - - if network_name is not None: - filename = 
f"{network_name}_connected_components_per_ts" - else: - filename = "_connected_components_per_ts" - - plot_for_snapshots(num_components, filename, "Number of connected components", plot_path = plot_path) - return - - -def size_connected_components(graph: tuple) -> List[List]: - r""" - Calculate the sizes of connected components per timestamp - Returns: - list[list]: A list containing lists of sizes of connected components for each timestamp. - """ - component_sizes = [] - for t in range(len(graph.data)): - edgelist_t = graph.data[t] - nodes_t = graph.edgelist_node_list(edgelist_t) - parent = {node: node for node in nodes_t} - - for edge in edgelist_t: - (u, v) = edge - _merge(u, v, parent) - - component_sizes_t = {} - for u in nodes_t: - root = _find(u, parent) - if root not in component_sizes_t: - component_sizes_t[root] = 0 - component_sizes_t[root] += 1 - - component_sizes_t_list = list(component_sizes_t.values()) - component_sizes.append(component_sizes_t_list) - - return component_sizes - - -def get_avg_node_engagement(graph: tuple) -> List[int]: - r""" - Calculate the average node engagement per timestamp, - the average number of distinct nodes that establish - at least one new connection. - Parameters: - graph_edgelist: Dictionary containing graph data - """ - engaging_nodes = [] - previous_edges = set() - - for ts in range(len(graph.data)): - edgelist_t = graph.data[ts] - new_nodes = set() - - for edge in edgelist_t: - (u, v) = edge - if frozenset({u, v}) not in previous_edges: - if u not in new_nodes: - new_nodes.add(u) - if v not in new_nodes: - new_nodes.add(v) - - engaging_nodes.append(len(new_nodes)) - previous_edges = {frozenset({u, v}) for (u, v) in edgelist_t} # Update the set of previous edges for next timestamp - +from tgx.utils.plotting_utils import plot_for_snapshots, plot_nodes_edges_per_ts, plot_density_map +import networkx as nx +import numpy as np +from tgx.utils.graph_utils import train_test_split +from typing import List, Dict + +__all__ = ["connected_components_per_ts", + "size_connected_components", + "get_avg_node_engagement", + "degree_density"] + + +def degree_density(graph: tuple, k: int = 10, network_name: str = None, plot_path: str = None) -> None: + r""" + Plot density map of node degrees per time window + Parameters: + graph_edgelist: Dictionary containing graph data + k: number of time windows + network_name: name of the graph to be used in the output file name + plot_path: path to save the output figure + """ + graph_edgelist = graph.data + degrees_by_k_list = [] + temp = [] + temp_idx = 0 + unique_ts = list(graph_edgelist.keys()) + + for ts in unique_ts: + e_at_this_ts = graph_edgelist[ts] + G = nx.MultiGraph() + + for e in e_at_this_ts: + G.add_edge(e[0], e[1]) + + nodes = G.nodes() + degrees = [G.degree[n] for n in nodes] + + if temp_idx None: + r""" + Plot number of connected components per timestamp + Parameters: + graph: a list containing graph snapshots + network_name: name of the graph to be used in the output file name + plot_path: path to save the output figure + """ + num_components = [] + for t in range(len(graph.data)): + edgelist_t = graph.data[t] + nodes_t = graph.edgelist_node_list(edgelist_t) + parent = {node: node for node in nodes_t} + + for edge in edgelist_t: + (u, v) = edge + _merge(u, v, parent) + + num = 0 + for u in nodes_t: + if parent[u] == u: + num += 1 + num_components.append(num) + + if network_name is not None: + filename = f"{network_name}_connected_components_per_ts" + else: + filename = "_connected_components_per_ts" + 
+ plot_for_snapshots(num_components, filename, "Number of connected components", plot_path = plot_path) + return + + +def size_connected_components(graph: tuple) -> List[List]: + r""" + Calculate the sizes of connected components per timestamp + Returns: + list[list]: A list containing lists of sizes of connected components for each timestamp. + """ + component_sizes = [] + for t in range(len(graph.data)): + edgelist_t = graph.data[t] + nodes_t = graph.edgelist_node_list(edgelist_t) + parent = {node: node for node in nodes_t} + + for edge in edgelist_t: + (u, v) = edge + _merge(u, v, parent) + + component_sizes_t = {} + for u in nodes_t: + root = _find(u, parent) + if root not in component_sizes_t: + component_sizes_t[root] = 0 + component_sizes_t[root] += 1 + + component_sizes_t_list = list(component_sizes_t.values()) + component_sizes.append(component_sizes_t_list) + + return component_sizes + + +def get_avg_node_engagement(graph: tuple) -> List[int]: + r""" + Calculate the average node engagement per timestamp, + the average number of distinct nodes that establish + at least one new connection. + Parameters: + graph_edgelist: Dictionary containing graph data + """ + engaging_nodes = [] + previous_edges = set() + + for ts in range(len(graph.data)): + edgelist_t = graph.data[ts] + new_nodes = set() + + for edge in edgelist_t: + (u, v) = edge + if frozenset({u, v}) not in previous_edges: + if u not in new_nodes: + new_nodes.add(u) + if v not in new_nodes: + new_nodes.add(v) + + engaging_nodes.append(len(new_nodes)) + previous_edges = {frozenset({u, v}) for (u, v) in edgelist_t} # Update the set of previous edges for next timestamp + return engaging_nodes \ No newline at end of file diff --git a/tgx/utils/plotting_utils.py b/tgx/utils/plotting_utils.py index 57318be..3c3efad 100644 --- a/tgx/utils/plotting_utils.py +++ b/tgx/utils/plotting_utils.py @@ -1,137 +1,134 @@ -import datetime -import pandas as pd -import matplotlib.pyplot as plt -import numpy as np -import matplotlib.colors as mcolors -from matplotlib.ticker import MaxNLocator - -def create_ts_list(start, end, metric=None, interval=None): - if metric == "Unix" or metric == "unix" or metric == "UNIX": - start = datetime.datetime.fromtimestamp(start).date() - end = datetime.datetime.fromtimestamp(end).date() - if interval == 'daily': - date_list = pd.date_range(start = start, end = end, freq="D") - elif interval == "month": - date_list = pd.date_range(start = start, end = end, freq="M") - elif interval == "year": - date_list = pd.date_range(start = start, end = end, freq="Y") - timelist = [] - for dates in date_list: - timelist.append(dates.strftime("%Y/%m/%d")) - else: - timelist = list(range(start, end, interval)) - # print(timelist) - return timelist - - - -def plot_nodes_edges_per_ts(edges: list, - nodes: list, - ts: list, - network_name: str, - plot_path: str = None, - ylabel_1: str = 'Edges per Timestamp', - ylabel_2: str = 'Nodes per Timestamp'): - """ - Plot nodes and edges per timestamp in one figure - Parameters: - edges: A list containing number of edges per timestamp - nodes: A list containing number of nodes per timestamp - ts: list of timestamps - network_name: Name of the network to be used in the output file name - plot_path: Path to save the output figure - ylabel_1: Label for the edges per timestamp line - ylabel_2: Label for the nodes per timestamp line - """ - fig = plt.figure(facecolor='w', figsize=(11, 6)) - ax1 = fig.add_subplot(111) - ax2 = ax1.twinx() - - c1, = ax1.plot(ts, edges, color='black', lw=3, 
label=ylabel_1) - c2, = ax2.plot(ts, nodes, color='gray', linestyle='dashed', lw=3, label=ylabel_2) - curves = [c1, c2] - ax1.legend(curves, [curve.get_label() for curve in curves], fontsize = 18) - ax1.set_xlabel('Time', fontsize=20) - ax1.set_ylabel(ylabel_1, fontsize=20) - ax2.set_ylabel(ylabel_2, fontsize=20) - ax1.tick_params(labelsize=20) - ax2.tick_params(labelsize=20) - ax1.set_ylim(0) - ax2.set_ylim(0) - ax1.set_xlim(0, len(ts)-1) - if plot_path is not None: - filename = f"{network_name}_node&edge_per_ts" - plt.savefig(f'{plot_path}/{filename}') - plt.show() - -def plot_for_snapshots(data: list, - filename: str, - y_title: str, - show_ave: bool=True, - plot_path:str = ".", - plot_title:str = None): - ''' - Plot a variable for different timestamps - Parameters: - data: A list of desired variable to be plotted - filename: Name of the output file name - y_title: Title of the y axis - show_ave: Whether to plot a line showing the average of the variable over all timestamps - plot_path: The path to save the output file - ''' - ts = list(range(0, len(data))) - # plt.rcParams["font.family"] = "Times New Roman" - fig = plt.figure(facecolor='w', figsize=(9,6)) - ax = fig.add_subplot(111) - ax.plot(ts, data, color='black', lw=3) - - ax.set_xlabel('Time', fontsize=20) - ax.set_ylabel(y_title, fontsize=20) - ax.tick_params(labelsize=20) - # ax.set_ylim(0, 7.5) - ax.set_xlim(0, len(ts)-1) - ax.set_title(plot_title, fontsize=20) - if show_ave: - ave_deg = [np.average(data) for i in range(len(ts))] - ax.plot(ts, ave_deg, color='#ca0020', linestyle='dashed', lw=3) - if plot_path is not None: - plt.savefig(f'{plot_path}/{filename}') - plt.show() - - -def plot_density_map(data, filename, y_title, plot_path=None): - ''' - Plot a density map using fig and ax - ''' - max_value = max(max(inner) for inner in data if inner) - c = np.zeros((max_value, len(data))) - - for i, row in enumerate(data): - for value in row: - c[value - 1][i] += 1 - - # Plot - fig = plt.figure(facecolor='w', figsize=(9, 6)) - ax = fig.add_subplot(111) - - norm = mcolors.Normalize(vmin=0, vmax=1) - cax = ax.imshow(c, cmap='viridis', interpolation='nearest', norm=norm) - cbar = fig.colorbar(cax) - cbar.set_label('Frequency') - - ax.set_title("Heatmap of Node Degrees Over Time") - ax.set_xlabel('Time', fontsize=20) - ax.set_ylabel(y_title, fontsize=20) - ax.tick_params(labelsize=20) - ax.xaxis.set_major_locator(MaxNLocator(integer=True)) - - # Adjust the aspect ratio of the plot - ax.set_aspect('auto') - - if plot_path is not None: - plt.savefig(f'{plot_path}/{filename}') - plt.show() - -if __name__ == "__main__": - create_ts_list(86400, 86400*365, "unix", "month") +import datetime +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +import matplotlib.colors as mcolors +from matplotlib.ticker import MaxNLocator + +def create_ts_list(start, end, metric=None, interval=None): + if metric == "Unix" or metric == "unix" or metric == "UNIX": + start = datetime.datetime.fromtimestamp(start).date() + end = datetime.datetime.fromtimestamp(end).date() + if interval == 'daily': + date_list = pd.date_range(start = start, end = end, freq="D") + elif interval == "month": + date_list = pd.date_range(start = start, end = end, freq="M") + elif interval == "year": + date_list = pd.date_range(start = start, end = end, freq="Y") + timelist = [] + for dates in date_list: + timelist.append(dates.strftime("%Y/%m/%d")) + else: + timelist = list(range(start, end, interval)) + # print(timelist) + return timelist + + + +def 
plot_nodes_edges_per_ts(edges: list,
+                            nodes: list,
+                            ts: list,
+                            network_name: str,
+                            plot_path: str = None,
+                            ylabel_1: str = 'Edges per Timestamp',
+                            ylabel_2: str = 'Nodes per Timestamp'):
+    """
+    Plot nodes and edges per timestamp in one figure
+    Parameters:
+        edges: A list containing number of edges per timestamp
+        nodes: A list containing number of nodes per timestamp
+        ts: list of timestamps
+        network_name: Name of the network to be used in the output file name
+        plot_path: Path to save the output figure
+        ylabel_1: Label for the edges per timestamp line
+        ylabel_2: Label for the nodes per timestamp line
+    """
+    fig = plt.figure(facecolor='w', figsize=(11, 6))
+    ax1 = fig.add_subplot(111)
+    ax2 = ax1.twinx()
+
+    c1, = ax1.plot(ts, edges, color='black', lw=3, label=ylabel_1)
+    c2, = ax2.plot(ts, nodes, color='gray', linestyle='dashed', lw=3, label=ylabel_2)
+    curves = [c1, c2]
+    ax1.legend(curves, [curve.get_label() for curve in curves], fontsize = 18)
+    ax1.set_xlabel('Time', fontsize=20)
+    ax1.set_ylabel(ylabel_1, fontsize=20)
+    ax2.set_ylabel(ylabel_2, fontsize=20)
+    ax1.tick_params(labelsize=20)
+    ax2.tick_params(labelsize=20)
+    ax1.set_ylim(0)
+    ax2.set_ylim(0)
+    ax1.set_xlim(0, len(ts)-1)
+    if plot_path is not None:
+        filename = f"{network_name}_node&edge_per_ts"
+        plt.savefig(f'{plot_path}/{filename}')
+    plt.show()
+
+def plot_for_snapshots(data: list,
+                       y_title: str,
+                       filename: str = None,
+                       show_ave: bool = True):
+    '''
+    Plot a variable for different timestamps
+    Parameters:
+        data: A list of the desired variable to be plotted
+        y_title: Title of the y axis
+        filename: Full path of the output figure; if None, the plot is shown instead of saved
+        show_ave: Whether to plot a line showing the average of the variable over all timestamps
+    '''
+    ts = list(range(0, len(data)))
+    # plt.rcParams["font.family"] = "Times New Roman"
+    fig = plt.figure(facecolor='w', figsize=(9,6))
+    ax = fig.add_subplot(111)
+    ax.plot(ts, data, color='black', lw=3)
+
+    ax.set_xlabel('Time', fontsize=20)
+    ax.set_ylabel(y_title, fontsize=20)
+    ax.tick_params(labelsize=20)
+    ax.set_xlim(0, len(ts)-1)
+    if show_ave:
+        ave_deg = [np.average(data) for i in range(len(ts))]
+        ax.plot(ts, ave_deg, color='#ca0020', linestyle='dashed', lw=3)
+    if filename is not None:
+        plt.savefig(f'{filename}')
+    else:
+        plt.show()
+
+
+def plot_density_map(data, filename, y_title, plot_path=None):
+    '''
+    Plot a density map using fig and ax
+    '''
+    max_value = max(max(inner) for inner in data if inner)
+    c = np.zeros((max_value, len(data)))
+
+    for i, row in enumerate(data):
+        for value in row:
+            c[value - 1][i] += 1
+
+    # Plot
+    fig = plt.figure(facecolor='w', figsize=(9, 6))
+    ax = fig.add_subplot(111)
+
+    norm = mcolors.Normalize(vmin=0, vmax=1)
+    cax = ax.imshow(c, cmap='viridis', interpolation='nearest', norm=norm)
+    cbar = fig.colorbar(cax)
+    cbar.set_label('Frequency')
+
+    ax.set_title(f"Heatmap of {y_title} Over Time")
+    ax.set_xlabel('Time', fontsize=20)
+    ax.set_ylabel(y_title, fontsize=20)
+    ax.tick_params(labelsize=20)
+    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
+
+    # Adjust the aspect ratio of the plot
+    ax.set_aspect('auto')
+
+    if plot_path is not None:
+        plt.savefig(f'{plot_path}/{filename}')
+    plt.show()
+
+if __name__ == "__main__":
+    create_ts_list(86400, 86400*365, "unix", "month")
     create_ts_list(2015, 2022, interval=2)
\ No newline at end of file
diff --git a/tgx/utils/stat.py b/tgx/utils/stat.py
index 864427d..a03783a 100644
--- a/tgx/utils/stat.py
+++ b/tgx/utils/stat.py
@@ -1,550 +1,550 @@
-from tgx.utils.plotting_utils import plot_for_snapshots, plot_nodes_edges_per_ts, plot_density_map -import networkx as nx -import numpy as np -from tgx.utils.graph_utils import train_test_split -from typing import List, Dict - -__all__ = ["degree_over_time", - "nodes_over_time", - "edges_over_time", - "nodes_and_edges_over_time", - "get_avg_e_per_ts", - "get_avg_degree", - "get_num_timestamps", - "get_num_unique_edges", - "get_reoccurrence", - "get_surprise", - "get_novelty", - "get_avg_node_activity", - "get_avg_node_engagement", - "degree_density", - "connected_components_per_ts", - "size_connected_components", - "get_avg_node_engagement"] - - -def degree_over_time(graph: object, - network_name: str, - filepath: str = ".") -> None: - r''' - Plot average degree per timestamp. - Parameters: - graph: Graph object created by tgx.Graph containing edgelist - total_nodes: number of nodes that appear through all the snapshots - network_name: name of the graph to be used in the output file name - filepath: path to save the output figure - ''' - print("Plotting average degree per timestamp") - ave_degree = _calculate_average_degree_per_ts(graph) - - if network_name is not None: - filename = f"{network_name}_ave_degree_per_ts" - else: - filename = "ave_degree_per_ts" - plot_for_snapshots(ave_degree, filename, "Average degree", plot_path = filepath) - return - - - -def nodes_over_time(graph: object, - network_name: str, - filepath: str = ".") -> None: - - r''' - Plot number of active nodes per timestamp. - Parameters: - graph: Graph object created by tgx.Graph containing edgelist - network_name: name of the graph to be used in the output file name - filepath: path to save the output figure - ''' - print("Plotting number of nodes per timestamp.") - active_nodes = _calculate_node_per_ts(graph) - if network_name is not None: - filename = f"{network_name}_nodes_per_ts" - else: - filename = "nodes_per_ts" - plot_for_snapshots(active_nodes, filename, "Number of nodes", plot_path = filepath) - return - -def edges_over_time(graph: object, - plot_path: str = None, - network_name: str = None, - filepath: str = ".") -> None: - r''' - Plot number of edges per timestamp. - Parameters: - graph: Graph object created by tgx.Graph containing edgelist - network_name: name of the graph to be used in the output file name - filepath: path to save the output figure - ''' - print("Plotting number of edges per timestamp.") - active_edges = _calculate_edge_per_ts(graph) - if network_name is not None: - filename = f"{network_name}_edges_per_ts" - else: - filename = "_edges_per_ts" - plot_for_snapshots(active_edges, plot_path, filename, "Number of edges", plot_path = filepath) - return - -def nodes_and_edges_over_time(graph: object, - network_name: str , - filepath: str = "."): - r""" - Plot number of nodes per timestamp and number of edges per timestamp in one fiugre. 
- Parameters: - graph: Graph object created by tgx.Graph containing edgelist - network_name: name of the graph to be used in the output file name - filepath: path to save the output figure - """ - print("Plotting number of nodes and edges per timestamp.") - edges = _calculate_edge_per_ts(graph) - nodes = _calculate_node_per_ts(graph) - ts = list(range(0, len(graph.data))) - - - return plot_nodes_edges_per_ts(edges, nodes, ts, network_name = network_name, plot_path = filepath) - - - -def _calculate_average_degree_per_ts(graph): - total_nodes = graph.total_nodes() - total_ts = len(graph.data) - ave_degree = [] - for ts in range(total_ts): - num_edges = len(graph.data[ts]) - ave_degree.append(num_edges*2/ total_nodes) - return ave_degree - - -def _calculate_node_per_ts(graph): - active_nodes = [] - for ts in range(len(graph.data)): - active_nodes.append(graph.edgelist_node_count(graph.data[ts])) - return active_nodes - -def _calculate_edge_per_ts(graph): - active_edges = [] - for ts in range(len(graph.data)): - active_edges.append(len(graph.data[ts])) - return active_edges - -def get_avg_e_per_ts(graph_edgelist: dict) -> float: - r""" - Calculate the average number of edges per timestamp - - Parameters: - graph: Graph object created by tgx.Graph containing edgelist - """ - sum_num_e_per_ts = 0 - unique_ts = list(graph_edgelist.keys()) - for ts in unique_ts: - num_e_at_this_ts = 0 - edge_at_this_ts = graph_edgelist[ts] - for e, repeat in edge_at_this_ts.items(): - num_e_at_this_ts += repeat - sum_num_e_per_ts += num_e_at_this_ts - avg_num_e_per_ts = (sum_num_e_per_ts * 1.0) / len(unique_ts) - - print(f"INFO: avg_num_e_per_ts: {avg_num_e_per_ts}") - return avg_num_e_per_ts - - -def get_avg_degree(graph: object) -> float: - r""" - Calculate average degree over the timestamps - Parameters: - graph: Graph object created by tgx.Graph containing edgelist - """ - graph_edgelist = graph.data - degree_avg_at_ts_list = [] - unique_ts = list(graph_edgelist.keys()) - for ts in unique_ts: - e_at_this_ts = graph_edgelist[ts] - G = nx.MultiGraph() - for e, repeat in e_at_this_ts.items(): - G.add_edge(e[0], e[1], weight=repeat) - nodes = G.nodes() - degrees = [G.degree[n] for n in nodes] - degree_avg_at_ts_list.append(np.mean(degrees)) - - print(f"INFO: avg_degree: {np.mean(degree_avg_at_ts_list)}") - return np.mean(degree_avg_at_ts_list) - - -def get_num_timestamps(graph_edgelist:dict) -> int: - r""" - Calculate the number of timestamps - Parameters: - graph: Graph object created by tgx.Graph containing edgelist - """ - print(f"INFO: Number of timestamps: {len(graph_edgelist)}") - return len(graph_edgelist) - -def get_num_unique_edges(graph: object) -> int: - r""" - Calculate the number of unique edges - Parameters: - graph: Graph object created by tgx.Graph containing edgelist - """ - graph_edgelist = graph.data - unique_edges = {} - for ts, e_list in graph_edgelist.items(): - for e in e_list: - if e not in unique_edges: - unique_edges[e] = 1 - print(f"INFO: Number of unique edges: {len(unique_edges)}") - return len(unique_edges) - - -def _split_data_chronological(graph_edgelist: dict, test_ratio: int): - r""" - split the timestamped edge-list chronologically - """ - # split the temporal graph data chronologically - unique_ts = np.sort(list(graph_edgelist.keys())) - test_split_time = list(np.quantile(unique_ts, [1 - test_ratio]))[0] - - # make train-validation & test splits - train_val_e_set, test_e_set = {}, {} - for ts, e_list in graph_edgelist.items(): - for (u,v) in e_list: - - if ts < test_split_time: 
- if (u,v) not in train_val_e_set: - train_val_e_set[(u,v)] = 1 - else: - if (u,v) not in test_e_set: - test_e_set[(u,v)] = 1 - return train_val_e_set, test_e_set - -def find(x, parent): - if parent[x] == x: - return x - parent[x] = find(parent[x], parent) - return parent[x] - - -def merge(x, y, parent): - root_x = find(x, parent) - root_y = find(y, parent) - - if root_x != root_y: - parent[root_x] = root_y - -def get_reoccurrence(graph:object, test_ratio: float=0.15) -> float: - r""" - Calculate the recurrence index - Parameters: - graph: Graph object created by tgx.Graph containing edgelist - test_ratio: The ratio to split the data chronologically - """ - graph_edgelist = graph.data - train_val_e_set, test_e_set = _split_data_chronological(graph_edgelist, test_ratio) - train_val_size = len(train_val_e_set) - # intersect = 0 - # total_train_freq = 0 - # for e, freq in train_val_e_set.items(): - # if freq > 1: - # print(e) - # total_train_freq += freq - # if e in test_e_set: - # intersect += freq - - # print(total_train_freq, intersect) - # reoccurrence = float(intersect * 1.0 / total_train_freq) - intersect = 0 - for e in test_e_set: - if e in train_val_e_set: - intersect += 1 - reoccurrence = float(intersect * 1.0 / train_val_size) - print(f"INFO: Reoccurrence: {reoccurrence}") - return reoccurrence - -def get_surprise(graph, test_ratio: float = 0.15) -> float: - r""" - Calculate the surprise index - Parameters: - graph: Graph object created by tgx.Graph containing edgelist - test_ratio: The ratio to split the data chronologically - """ - graph_edgelist = graph.data - train_val_e_set, test_e_set = _split_data_chronological(graph_edgelist, test_ratio) - test_size = len(test_e_set) - - difference = 0 - # total_test_freq = 0 - # for e, freq in test_e_set.items(): - # total_test_freq += freq - # if e not in train_val_e_set: - # difference += freq - # surprise = float(difference * 1.0 / total_test_freq) - - for e in test_e_set: - if e not in train_val_e_set: - difference += 1 - surprise = float(difference * 1.0 / test_size) - print(f"INFO: Surprise: {surprise}") - return surprise - -def get_novelty(graph : object) -> float: - r""" - Calculate the novelty index - Parameters: - graph: Graph object created by tgx.Graph containing edgelist - """ - graph_edgelist = graph.data - unique_ts = np.sort(list(graph_edgelist.keys())) - novelty_ts = [] - for ts_idx, ts in enumerate(unique_ts): - e_set_this_ts = set(list(graph_edgelist[ts])) - e_set_seen = [] - for idx in range(0, ts_idx): - e_set_seen.append(list(graph_edgelist[unique_ts[idx]])) - e_set_seen = set(item for sublist in e_set_seen for item in sublist) - novelty_ts.append(float(len(e_set_this_ts - e_set_seen) * 1.0 / len(e_set_this_ts))) - - novelty = float(np.sum(novelty_ts) * 1.0 / len(unique_ts)) - print(f"INFO: Novelty: {novelty}") - return novelty - - -def get_avg_node_activity(graph: object) -> float: - r""" - Calculate the average node activity, - the proportion of time steps a node is present - Parameters: - graph: Graph object created by tgx.Graph containing edgelist - """ - graph_edgelist = graph.data - num_unique_ts = len(graph_edgelist) - node_ts = {} - for ts, e_list in graph_edgelist.items(): - for e in e_list: - # source - if e[0] not in node_ts: - node_ts[e[0]] = {ts: True} - else: - if ts not in node_ts[e[0]]: - node_ts[e[0]][ts] = True - - # destination - if e[1] not in node_ts: - node_ts[e[1]] = {ts: True} - else: - if ts not in node_ts[e[1]]: - node_ts[e[1]][ts] = True - - node_activity_ratio = [] - for n, ts_list in 
node_ts.items():
-        node_activity_ratio.append(float(len(ts_list) * 1.0 / num_unique_ts))
-
-    avg_node_activity = float(np.sum(node_activity_ratio) * 1.0 / len(node_activity_ratio))
-    print(f"INFO: Node activity ratio: {avg_node_activity}")
-    return avg_node_activity
-
-
-#* new graph stats added
-#TODO to not require k as input but get it from the Graph object
-def degree_density(graph : object, k: int = 10, network_name: str = None, plot_path: str = None) -> None:
-    r"""
-    Plot density map of node degrees per time window
-    Parameters:
-        graph: Graph object created by tgx.Graph containing edgelist
-        k: number of time windows
-        network_name: name of the graph to be used in the output file name
-        plot_path: path to save the output figure
-    """
-    graph_edgelist = graph.data
-
-    degrees_by_k_list = []
-    temp = []
-    temp_idx = 0
-    unique_ts = list(graph_edgelist.keys())
-    for ts in unique_ts:
-        e_at_this_ts = graph_edgelist[ts]
-        G = nx.MultiGraph()
-        for e, repeat in e_at_this_ts.items():
-            G.add_edge(e[0], e[1], weight=repeat)
-        nodes = G.nodes()
-        degrees = [G.degree[n] for n in nodes]
-
-        if temp_idx < k:
-            temp.extend(degrees)
-            temp_idx += 1
-        else:
-            degrees_by_k_list.append(temp)
-            temp = degrees
-            temp_idx = 1
-
-    if temp:
-        degrees_by_k_list.append(temp)
-
-    if network_name is not None:
-        filename = f"{network_name}_degree_density"
-    else:
-        filename = "_degree_density"
-
-    plot_density_map(degrees_by_k_list, filename, "Node Degree", plot_path = plot_path)
-    return
-
-
-def _find(x, parent):
-    if parent[x] == x:
-        return x
-    parent[x] = _find(parent[x], parent)
-    return parent[x]
-
-
-def _merge(x, y, parent):
-    root_x = _find(x, parent)
-    root_y = _find(y, parent)
-
-    if root_x != root_y:
-        parent[root_x] = root_y
-
-
-def connected_components_per_ts(graph: list,
-                                network_name: str = None,
-                                plot_path: str = None) -> None:
-    r"""
-    Plot number of connected components per timestamp
-    Parameters:
-        graph: a list containing graph snapshots
-        network_name: name of the graph to be used in the output file name
-        plot_path: path to save the output figure
-    """
-    num_components = []
-    for t in range(len(graph)):
-        parent = list(range(graph[t].number_of_nodes))
-
-        for _, edge_data in graph[t].edgelist.items():
-            for (u, v), _ in edge_data.items():
-                _merge(u, v, parent)
-
-        num = 0
-        for u in graph[t].nodes():
-            if parent[u] == u:
-                num += 1
-        num_components.append(num)
-
-    if network_name is not None:
-        filename = f"{network_name}_connected_components_per_ts"
-    else:
-        filename = "_connected_components_per_ts"
-    plot_for_snapshots(num_components, filename, "Number of connected components", plot_path = plot_path)
-    print(num_components)
-    print("Plotting Done!")
-
-    return
-
-#TODO to be fixed
-def size_connected_components(graph: list) -> List[Dict]:
-    r"""
-    Calculate the sizes of connected components per timestamp
-    Returns:
-        list: A list containing the sizes of connected components in each timestamp.
-    """
-    component_sizes = []
-    for t in range(len(graph)):
-        parent = list(range(graph[t].number_of_nodes))
-
-        for _, edge_data in graph[t].edgelist.items():
-            for (u, v), _ in edge_data.items():
-                _merge(u, v, parent)
-
-        component_sizes_t = {}
-        for u in graph[t].nodes():
-            root = _find(u, parent)
-            if root not in component_sizes_t:
-                component_sizes_t[root] = 0
-            component_sizes_t[root] += 1
-
-        component_sizes.append(component_sizes_t)
-
-    return component_sizes
-
-
-def get_avg_node_engagement(graph_edgelist: dict) -> List[int]:
-    r"""
-    Calculate the average node engagement per timestamp,
-    the average number of distinct nodes that establish
-    at least one new connection.
- Parameters: - graph_edgelist: Dictionary containing graph data - """ - engaging_nodes = [] - previous_edges = set() - for ts, e_list in graph_edgelist.items(): - node_set = set() - new_edges = {(u, v) for (u, v), _ in e_list.items() if frozenset({u, v}) not in previous_edges} - for u, v in new_edges: - if u not in node_set: - node_set.add(u) - if v not in node_set: - node_set.add(v) - engaging_nodes.append(len(node_set)) - previous_edges = {frozenset({u, v}) for (u, v), _ in e_list.items()} # Update the set of previous edges for the next timestamp - return engaging_nodes - - - - -# def size_connected_components(graph) -> list: -# """ -# Calculate the sizes of connected components per timestamp. - -# Returns: -# component_sizes: A list containing the sizes of connected components in each timestamp. -# """ - -# component_sizes = [] -# for t in range(len(graph)): -# parent = list(range(graph[t].number_of_nodes)) - -# for _, edge_data in graph[t].edgelist.items(): -# for (u, v), _ in edge_data.items(): -# merge(u, v, parent) - -# component_sizes_t = {} -# for u in graph[t].nodes(): -# root = find(u, parent) -# if root not in component_sizes_t: -# component_sizes_t[root] = 0 -# component_sizes_t[root] += 1 - -# component_sizes.append(component_sizes_t) - -# return component_sizes - - -# def num_connected_components_per_ts(graph: list, -# network_name: str = None, -# plot_path: str = None) -> None: -# """ - -# Plot the number of connected components per timestamp. - -# """ - -# num_components = [] -# for t in range(len(graph)): -# parent = list(range(graph[t].number_of_nodes)) - -# for _, edge_data in graph[t].edgelist.items(): -# for (u, v), _ in edge_data.items(): -# merge(u, v, parent) - -# num = 0 -# for u in graph[t].nodes(): -# if parent[u] == u: -# num += 1 -# num_components.append(num) - -# if network_name is not None: -# filename = f"{network_name}_num_connected_components_per_ts" -# else: -# filename = "_num_connected_components_per_ts" -# plot_for_snapshots(num_components, filename, "Number of connected components", plot_path = plot_path) -# print(num_components) -# print("Plotting Done!") - -# return +from tgx.utils.plotting_utils import plot_for_snapshots, plot_nodes_edges_per_ts, plot_density_map +import networkx as nx +import numpy as np +from tgx.utils.graph_utils import train_test_split +from typing import List, Dict + +__all__ = ["degree_over_time", + "nodes_over_time", + "edges_over_time", + "nodes_and_edges_over_time", + "get_avg_e_per_ts", + "get_avg_degree", + "get_num_timestamps", + "get_num_unique_edges", + "get_reoccurrence", + "get_surprise", + "get_novelty", + "get_avg_node_activity", + "get_avg_node_engagement", + "degree_density", + "connected_components_per_ts", + "size_connected_components", + "get_avg_node_engagement"] + + +def degree_over_time(graph: object, + network_name: str, + filepath: str = ".") -> None: + r''' + Plot average degree per timestamp. 
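+    The average degree at timestamp t is computed as 2 * |E_t| / |V|, where
+    |V| is the total number of nodes seen across all snapshots.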
+    Parameters:
+        graph: Graph object created by tgx.Graph containing edgelist
+        network_name: name of the graph to be used in the output file name
+        filepath: path to save the output figure
+    '''
+    print("Plotting average degree per timestamp")
+    ave_degree = _calculate_average_degree_per_ts(graph)
+
+    if network_name is not None:
+        filename = f"{network_name}_ave_degree_per_ts"
+    else:
+        filename = "ave_degree_per_ts"
+    plot_for_snapshots(ave_degree, filename, "Average degree", plot_path = filepath)
+    return
+
+
+
+def nodes_over_time(graph: object,
+                    network_name: str,
+                    filepath: str = ".") -> None:
+
+    r'''
+    Plot number of active nodes per timestamp.
+    Parameters:
+        graph: Graph object created by tgx.Graph containing edgelist
+        network_name: name of the graph to be used in the output file name
+        filepath: path to save the output figure
+    '''
+    print("Plotting number of nodes per timestamp.")
+    active_nodes = _calculate_node_per_ts(graph)
+    if network_name is not None:
+        filename = f"{network_name}_nodes_per_ts"
+    else:
+        filename = "nodes_per_ts"
+    plot_for_snapshots(active_nodes, filename, "Number of nodes", plot_path = filepath)
+    return
+
+def edges_over_time(graph: object,
+                    plot_path: str = None,
+                    network_name: str = None,
+                    filepath: str = ".") -> None:
+    r'''
+    Plot number of edges per timestamp.
+    Parameters:
+        graph: Graph object created by tgx.Graph containing edgelist
+        network_name: name of the graph to be used in the output file name
+        filepath: path to save the output figure
+    '''
+    print("Plotting number of edges per timestamp.")
+    active_edges = _calculate_edge_per_ts(graph)
+    if network_name is not None:
+        filename = f"{network_name}_edges_per_ts"
+    else:
+        filename = "_edges_per_ts"
+    plot_for_snapshots(active_edges, filename, "Number of edges", plot_path = filepath)
+    return
+
+def nodes_and_edges_over_time(graph: object,
+                              network_name: str,
+                              filepath: str = "."):
+    r"""
+    Plot number of nodes per timestamp and number of edges per timestamp in one figure.
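+    Edge counts are drawn on the left y-axis and node counts on a twin right
+    y-axis, so both trends share a single time axis.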
+ Parameters: + graph: Graph object created by tgx.Graph containing edgelist + network_name: name of the graph to be used in the output file name + filepath: path to save the output figure + """ + print("Plotting number of nodes and edges per timestamp.") + edges = _calculate_edge_per_ts(graph) + nodes = _calculate_node_per_ts(graph) + ts = list(range(0, len(graph.data))) + + + return plot_nodes_edges_per_ts(edges, nodes, ts, network_name = network_name, plot_path = filepath) + + + +def _calculate_average_degree_per_ts(graph): + total_nodes = graph.total_nodes() + total_ts = len(graph.data) + ave_degree = [] + for ts in range(total_ts): + num_edges = len(graph.data[ts]) + ave_degree.append(num_edges*2/ total_nodes) + return ave_degree + + +def _calculate_node_per_ts(graph): + active_nodes = [] + for ts in range(len(graph.data)): + active_nodes.append(graph.edgelist_node_count(graph.data[ts])) + return active_nodes + +def _calculate_edge_per_ts(graph): + active_edges = [] + for ts in range(len(graph.data)): + active_edges.append(len(graph.data[ts])) + return active_edges + +def get_avg_e_per_ts(graph_edgelist: dict) -> float: + r""" + Calculate the average number of edges per timestamp + + Parameters: + graph: Graph object created by tgx.Graph containing edgelist + """ + sum_num_e_per_ts = 0 + unique_ts = list(graph_edgelist.keys()) + for ts in unique_ts: + num_e_at_this_ts = 0 + edge_at_this_ts = graph_edgelist[ts] + for e, repeat in edge_at_this_ts.items(): + num_e_at_this_ts += repeat + sum_num_e_per_ts += num_e_at_this_ts + avg_num_e_per_ts = (sum_num_e_per_ts * 1.0) / len(unique_ts) + + print(f"INFO: avg_num_e_per_ts: {avg_num_e_per_ts}") + return avg_num_e_per_ts + + +def get_avg_degree(graph: object) -> float: + r""" + Calculate average degree over the timestamps + Parameters: + graph: Graph object created by tgx.Graph containing edgelist + """ + graph_edgelist = graph.data + degree_avg_at_ts_list = [] + unique_ts = list(graph_edgelist.keys()) + for ts in unique_ts: + e_at_this_ts = graph_edgelist[ts] + G = nx.MultiGraph() + for e, repeat in e_at_this_ts.items(): + G.add_edge(e[0], e[1], weight=repeat) + nodes = G.nodes() + degrees = [G.degree[n] for n in nodes] + degree_avg_at_ts_list.append(np.mean(degrees)) + + print(f"INFO: avg_degree: {np.mean(degree_avg_at_ts_list)}") + return np.mean(degree_avg_at_ts_list) + + +def get_num_timestamps(graph_edgelist:dict) -> int: + r""" + Calculate the number of timestamps + Parameters: + graph: Graph object created by tgx.Graph containing edgelist + """ + print(f"INFO: Number of timestamps: {len(graph_edgelist)}") + return len(graph_edgelist) + +def get_num_unique_edges(graph: object) -> int: + r""" + Calculate the number of unique edges + Parameters: + graph: Graph object created by tgx.Graph containing edgelist + """ + graph_edgelist = graph.data + unique_edges = {} + for ts, e_list in graph_edgelist.items(): + for e in e_list: + if e not in unique_edges: + unique_edges[e] = 1 + print(f"INFO: Number of unique edges: {len(unique_edges)}") + return len(unique_edges) + + +def _split_data_chronological(graph_edgelist: dict, test_ratio: int): + r""" + split the timestamped edge-list chronologically + """ + # split the temporal graph data chronologically + unique_ts = np.sort(list(graph_edgelist.keys())) + test_split_time = list(np.quantile(unique_ts, [1 - test_ratio]))[0] + + # make train-validation & test splits + train_val_e_set, test_e_set = {}, {} + for ts, e_list in graph_edgelist.items(): + for (u,v) in e_list: + + if ts < test_split_time: 
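+                # test_split_time is the (1 - test_ratio) quantile of the
+                # timestamps, so edges strictly before it go to train/val and
+                # edges at or after it go to test.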
+ if (u,v) not in train_val_e_set: + train_val_e_set[(u,v)] = 1 + else: + if (u,v) not in test_e_set: + test_e_set[(u,v)] = 1 + return train_val_e_set, test_e_set + +def find(x, parent): + if parent[x] == x: + return x + parent[x] = find(parent[x], parent) + return parent[x] + + +def merge(x, y, parent): + root_x = find(x, parent) + root_y = find(y, parent) + + if root_x != root_y: + parent[root_x] = root_y + +def get_reoccurrence(graph:object, test_ratio: float=0.15) -> float: + r""" + Calculate the recurrence index + Parameters: + graph: Graph object created by tgx.Graph containing edgelist + test_ratio: The ratio to split the data chronologically + """ + graph_edgelist = graph.data + train_val_e_set, test_e_set = _split_data_chronological(graph_edgelist, test_ratio) + train_val_size = len(train_val_e_set) + # intersect = 0 + # total_train_freq = 0 + # for e, freq in train_val_e_set.items(): + # if freq > 1: + # print(e) + # total_train_freq += freq + # if e in test_e_set: + # intersect += freq + + # print(total_train_freq, intersect) + # reoccurrence = float(intersect * 1.0 / total_train_freq) + intersect = 0 + for e in test_e_set: + if e in train_val_e_set: + intersect += 1 + reoccurrence = float(intersect * 1.0 / train_val_size) + print(f"INFO: Reoccurrence: {reoccurrence}") + return reoccurrence + +def get_surprise(graph, test_ratio: float = 0.15) -> float: + r""" + Calculate the surprise index + Parameters: + graph: Graph object created by tgx.Graph containing edgelist + test_ratio: The ratio to split the data chronologically + """ + graph_edgelist = graph.data + train_val_e_set, test_e_set = _split_data_chronological(graph_edgelist, test_ratio) + test_size = len(test_e_set) + + difference = 0 + # total_test_freq = 0 + # for e, freq in test_e_set.items(): + # total_test_freq += freq + # if e not in train_val_e_set: + # difference += freq + # surprise = float(difference * 1.0 / total_test_freq) + + for e in test_e_set: + if e not in train_val_e_set: + difference += 1 + surprise = float(difference * 1.0 / test_size) + print(f"INFO: Surprise: {surprise}") + return surprise + +def get_novelty(graph : object) -> float: + r""" + Calculate the novelty index + Parameters: + graph: Graph object created by tgx.Graph containing edgelist + """ + graph_edgelist = graph.data + unique_ts = np.sort(list(graph_edgelist.keys())) + novelty_ts = [] + for ts_idx, ts in enumerate(unique_ts): + e_set_this_ts = set(list(graph_edgelist[ts])) + e_set_seen = [] + for idx in range(0, ts_idx): + e_set_seen.append(list(graph_edgelist[unique_ts[idx]])) + e_set_seen = set(item for sublist in e_set_seen for item in sublist) + novelty_ts.append(float(len(e_set_this_ts - e_set_seen) * 1.0 / len(e_set_this_ts))) + + novelty = float(np.sum(novelty_ts) * 1.0 / len(unique_ts)) + print(f"INFO: Novelty: {novelty}") + return novelty + + +def get_avg_node_activity(graph: object) -> float: + r""" + Calculate the average node activity, + the proportion of time steps a node is present + Parameters: + graph: Graph object created by tgx.Graph containing edgelist + """ + graph_edgelist = graph.data + num_unique_ts = len(graph_edgelist) + node_ts = {} + for ts, e_list in graph_edgelist.items(): + for e in e_list: + # source + if e[0] not in node_ts: + node_ts[e[0]] = {ts: True} + else: + if ts not in node_ts[e[0]]: + node_ts[e[0]][ts] = True + + # destination + if e[1] not in node_ts: + node_ts[e[1]] = {ts: True} + else: + if ts not in node_ts[e[1]]: + node_ts[e[1]][ts] = True + + node_activity_ratio = [] + for n, ts_list in 
node_ts.items():
+        node_activity_ratio.append(float(len(ts_list) * 1.0 / num_unique_ts))
+
+    avg_node_activity = float(np.sum(node_activity_ratio) * 1.0 / len(node_activity_ratio))
+    print(f"INFO: Node activity ratio: {avg_node_activity}")
+    return avg_node_activity
+
+
+#* new graph stats added
+#TODO to not require k as input but get it from the Graph object
+def degree_density(graph : object, k: int = 10, network_name: str = None, plot_path: str = None) -> None:
+    r"""
+    Plot density map of node degrees per time window
+    Parameters:
+        graph: Graph object created by tgx.Graph containing edgelist
+        k: number of time windows
+        network_name: name of the graph to be used in the output file name
+        plot_path: path to save the output figure
+    """
+    graph_edgelist = graph.data
+
+    degrees_by_k_list = []
+    temp = []
+    temp_idx = 0
+    unique_ts = list(graph_edgelist.keys())
+    for ts in unique_ts:
+        e_at_this_ts = graph_edgelist[ts]
+        G = nx.MultiGraph()
+        for e, repeat in e_at_this_ts.items():
+            G.add_edge(e[0], e[1], weight=repeat)
+        nodes = G.nodes()
+        degrees = [G.degree[n] for n in nodes]
+
+        if temp_idx < k:
+            temp.extend(degrees)
+            temp_idx += 1
+        else:
+            degrees_by_k_list.append(temp)
+            temp = degrees
+            temp_idx = 1
+
+    if temp:
+        degrees_by_k_list.append(temp)
+
+    if network_name is not None:
+        filename = f"{network_name}_degree_density"
+    else:
+        filename = "_degree_density"
+
+    plot_density_map(degrees_by_k_list, filename, "Node Degree", plot_path = plot_path)
+    return
+
+
+def _find(x, parent):
+    if parent[x] == x:
+        return x
+    parent[x] = _find(parent[x], parent)
+    return parent[x]
+
+
+def _merge(x, y, parent):
+    root_x = _find(x, parent)
+    root_y = _find(y, parent)
+
+    if root_x != root_y:
+        parent[root_x] = root_y
+
+
+def connected_components_per_ts(graph: list,
+                                network_name: str = None,
+                                plot_path: str = None) -> None:
+    r"""
+    Plot number of connected components per timestamp
+    Parameters:
+        graph: a list containing graph snapshots
+        network_name: name of the graph to be used in the output file name
+        plot_path: path to save the output figure
+    """
+    num_components = []
+    for t in range(len(graph)):
+        parent = list(range(graph[t].number_of_nodes))
+
+        for _, edge_data in graph[t].edgelist.items():
+            for (u, v), _ in edge_data.items():
+                _merge(u, v, parent)
+
+        num = 0
+        for u in graph[t].nodes():
+            if parent[u] == u:
+                num += 1
+        num_components.append(num)
+
+    if network_name is not None:
+        filename = f"{network_name}_connected_components_per_ts"
+    else:
+        filename = "_connected_components_per_ts"
+    plot_for_snapshots(num_components, filename, "Number of connected components", plot_path = plot_path)
+    print(num_components)
+    print("Plotting Done!")
+
+    return
+
+#TODO to be fixed
+def size_connected_components(graph: list) -> List[Dict]:
+    r"""
+    Calculate the sizes of connected components per timestamp
+    Returns:
+        list: A list containing the sizes of connected components in each timestamp.
+    """
+    component_sizes = []
+    for t in range(len(graph)):
+        parent = list(range(graph[t].number_of_nodes))
+
+        for _, edge_data in graph[t].edgelist.items():
+            for (u, v), _ in edge_data.items():
+                _merge(u, v, parent)
+
+        component_sizes_t = {}
+        for u in graph[t].nodes():
+            root = _find(u, parent)
+            if root not in component_sizes_t:
+                component_sizes_t[root] = 0
+            component_sizes_t[root] += 1
+
+        component_sizes.append(component_sizes_t)
+
+    return component_sizes
+
+
+def get_avg_node_engagement(graph_edgelist: dict) -> List[int]:
+    r"""
+    Calculate the average node engagement per timestamp,
+    the average number of distinct nodes that establish
+    at least one new connection.
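+    An edge counts as new at timestamp t when its node pair was absent from
+    the immediately preceding timestamp; earlier history is not consulted.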
+ Parameters: + graph_edgelist: Dictionary containing graph data + """ + engaging_nodes = [] + previous_edges = set() + for ts, e_list in graph_edgelist.items(): + node_set = set() + new_edges = {(u, v) for (u, v), _ in e_list.items() if frozenset({u, v}) not in previous_edges} + for u, v in new_edges: + if u not in node_set: + node_set.add(u) + if v not in node_set: + node_set.add(v) + engaging_nodes.append(len(node_set)) + previous_edges = {frozenset({u, v}) for (u, v), _ in e_list.items()} # Update the set of previous edges for the next timestamp + return engaging_nodes + + + + +# def size_connected_components(graph) -> list: +# """ +# Calculate the sizes of connected components per timestamp. + +# Returns: +# component_sizes: A list containing the sizes of connected components in each timestamp. +# """ + +# component_sizes = [] +# for t in range(len(graph)): +# parent = list(range(graph[t].number_of_nodes)) + +# for _, edge_data in graph[t].edgelist.items(): +# for (u, v), _ in edge_data.items(): +# merge(u, v, parent) + +# component_sizes_t = {} +# for u in graph[t].nodes(): +# root = find(u, parent) +# if root not in component_sizes_t: +# component_sizes_t[root] = 0 +# component_sizes_t[root] += 1 + +# component_sizes.append(component_sizes_t) + +# return component_sizes + + +# def num_connected_components_per_ts(graph: list, +# network_name: str = None, +# plot_path: str = None) -> None: +# """ + +# Plot the number of connected components per timestamp. + +# """ + +# num_components = [] +# for t in range(len(graph)): +# parent = list(range(graph[t].number_of_nodes)) + +# for _, edge_data in graph[t].edgelist.items(): +# for (u, v), _ in edge_data.items(): +# merge(u, v, parent) + +# num = 0 +# for u in graph[t].nodes(): +# if parent[u] == u: +# num += 1 +# num_components.append(num) + +# if network_name is not None: +# filename = f"{network_name}_num_connected_components_per_ts" +# else: +# filename = "_num_connected_components_per_ts" +# plot_for_snapshots(num_components, filename, "Number of connected components", plot_path = plot_path) +# print(num_components) +# print("Plotting Done!") + +# return diff --git a/tgx/viz/TEA.py b/tgx/viz/TEA.py index e684f4d..db80703 100644 --- a/tgx/viz/TEA.py +++ b/tgx/viz/TEA.py @@ -1,219 +1,219 @@ -import pandas as pd -import matplotlib.pyplot as plt -from typing import Union, Optional -from tgx.utils.graph_utils import discretize_edges -from tgx.utils.plotting_utils import create_ts_list -__all__ = ["TEA"] - -def TEA( - temp_edgelist : Union[object, dict], - filepath : Optional[str] = ".", - fig_size : tuple = (7,5), - font_size : int = 20, - network_name : str = None, - time_scale : Union[str, int] = None, - real_dates : bool = None, - test_split : bool = False, - density : bool = False - ): - r""" - generating TEA plot - - Parameters: - temp_edgelist: a dictionary of temporal edges or a dataset object. - filepath: Path to save the TEA Plot. - fig_size: Size of the figure to save. - font_size: Size of the text in the figure. - network_name: Name of the dataset to be used in the TEA plot file. - time_scale: time_scale for discretizing data if already not done. - real_dates: Whether to use the real dates from dataset. - test_split: Whether show the test split on the plot. - density: Whether to return edge density and edge frequency dictioneries. 
- """ - if isinstance(temp_edgelist, object): - if temp_edgelist.freq_data is None: - temp_edgelist.count_freq() - temp_edgelist = temp_edgelist.freq_data - - # check number of unique timestamps: - unique_ts = list(temp_edgelist.keys()) - # if len(unique_ts) > max_time_scale: - # inp = input(f"There are {unique_ts} timestamps in the data.\nDo you want to discretize the data to 1000 timestamps?(y/n)").lower() - # if inp == "y": - # temp_edgelist = edgelist_discritizer(temp_edgelist, - # unique_ts, - # time_scale = max_time_scale) - if time_scale is not None: - temp_edgelist = discretize_edges(temp_edgelist, - time_scale = time_scale) - - - ts_edges_dist, ts_edges_dist_density, edge_frequency_dict = TEA_process_edgelist_per_timestamp(temp_edgelist) - - TEA_plot_edges_bar(ts_edges_dist, - filepath = filepath, - fig_size = fig_size, - font_size = font_size, - network_name=network_name, - real_dates = real_dates, - test_split = test_split) - - if density: - return ts_edges_dist_density, edge_frequency_dict - - - -def TEA_process_edgelist_per_timestamp(temp_edgelist): - # generate distribution of the edges history - unique_ts = list(temp_edgelist.keys()) - # unique_ts.sort() - # print(f"There are {len(unique_ts)} timestamps.") - - # get node set & total number of nodes - node_dict = {} - for t, e_dict in temp_edgelist.items(): - for e, exist in e_dict.items(): - if e[0] not in node_dict: - node_dict[e[0]] = 1 - if e[1] not in node_dict: - node_dict[e[1]] = 1 - num_nodes = len(node_dict) - num_e_fully_connected = num_nodes * (num_nodes - 1) - - edge_frequency_dict = {} # how many times an edge is seen - ts_edges_dist = [] # contains different features specifying the characteristics of the edge distribution over time - ts_edges_dist_density = [] - for curr_t in unique_ts: - - # if curr_t < 2: - # print("curr_t", curr_t) - prev_ts = [ts for ts in unique_ts if ts < curr_t] - edges_in_prev_ts = {} - for bts in prev_ts: - edges_in_prev_ts.update(temp_edgelist[bts]) - - curr_ts_edge_list = temp_edgelist[curr_t] - for e in curr_ts_edge_list: - if e not in edge_frequency_dict: - edge_frequency_dict[e] = 1 - else: - edge_frequency_dict[e] += 1 - - if len(curr_ts_edge_list) > 0: - curr_ts_edges_dist = {'ts': curr_t, - 'new': len([e for e in curr_ts_edge_list if e not in edges_in_prev_ts]), - 'repeated': len([e for e in curr_ts_edge_list if e in edges_in_prev_ts]), - 'not_repeated': len([e for e in edges_in_prev_ts if e not in curr_ts_edge_list]), - 'total_curr_ts': len(curr_ts_edge_list), - 'total_seen_until_curr_ts': len(edges_in_prev_ts) + len(curr_ts_edge_list) - } - curr_ts_edges_dist_density = {'ts': curr_t, - 'new': (curr_ts_edges_dist['new'] * 1.0) / num_e_fully_connected, - 'repeated': (curr_ts_edges_dist['repeated'] * 1.0) / num_e_fully_connected, - 'not_repeated': (curr_ts_edges_dist[ - 'not_repeated'] * 1.0) / num_e_fully_connected, - 'total_curr_ts': (curr_ts_edges_dist[ - 'total_curr_ts'] * 1.0) / num_e_fully_connected, - 'total_seen_until_curr_ts': (curr_ts_edges_dist[ - 'total_seen_until_curr_ts'] * 1.0) / num_e_fully_connected, - } - else: - curr_ts_edges_dist = {'ts': curr_t, - 'new': 0, - 'repeated': 0, - 'not_repeated': 0, - 'total_curr_ts': 0, - 'total_seen_until_curr_ts': len(edges_in_prev_ts) + len(curr_ts_edge_list) - } - curr_ts_edges_dist_density = {'ts': curr_t, - 'new': 0, - 'repeated': 0, - 'not_repeated': 0, - 'total_curr_ts': 0, - 'total_seen_until_curr_ts': 0, - } - ts_edges_dist.append(curr_ts_edges_dist) - ts_edges_dist_density.append(curr_ts_edges_dist_density) - # 
print(len(edges_in_prev_ts)) - # print(len(ts_edges_dist)) - # print(edge_frequency_dict) - # break - return ts_edges_dist, ts_edges_dist_density, edge_frequency_dict - - -def TEA_plot_edges_bar(ts_edges_dist: list, - filepath: str = ".", - fig_size: list = (9,5), - font_size: int = 20, - network_name: str = None, - real_dates: list = None, - time_scale: list = None, - test_split: bool = False, - show: bool =False): - r""" - Making TEA plot and save into pdf file. - Args: - ts_edges_dist: list of dictionaries containing the edge distribution over time. - filepath: Path to save the TEA Plot. - fig_size: Size of the figure to save. - font_size: Size of the text in the figure. - network_name: Name of the dataset to be used in the TEA plot file. - real_dates: list of real dates as ticks - time_scale: time_scale for discretizing data if already not done. - test_split: Whether show the test split on the plot. - show: Whether to show the plot. - """ - - - ts_edges_dist_df = pd.DataFrame(ts_edges_dist, columns=['ts', 'new', 'repeated', - 'not_repeated', - 'total_curr_ts', - 'total_seen_until_curr_ts']) - - - ### Additional Stats ### - mean = ts_edges_dist_df.mean(axis=0) - # print("INFO: Network Name:", network_name) - # print("INFO: AVG. stats. over all timestamps: ", mean) - # print("INFO: ratio of avg.(new)/avg.(total_curr_ts): {:.2f}".format(mean['new'] / mean['total_curr_ts'])) - ### - - fig, ax = plt.subplots(figsize=fig_size) # lastfm, mooc, reddit, UNtrade, UNvote - plt.subplots_adjust(bottom=0.2, left=0.2) - font_size = font_size - ticks_font_size = 15 - plt.yticks(fontsize=ticks_font_size) - plt.xticks(fontsize=ticks_font_size) - if real_dates is not None: - start = real_dates[0] - end = real_dates[1] - metric = real_dates[2] - create_ts_list(start, end, metric=metric, interval=time_scale) - else: - duration = ts_edges_dist_df['ts'].tolist() - timestamps = [i for i in range(len(duration))] - - new = ts_edges_dist_df['new'].tolist() - repeated = ts_edges_dist_df['repeated'].tolist() - # print(len(timestamps), repeated, new) - # plotting stuffs - # bar plot - plt.bar(timestamps, repeated, label='Repeated', color='#404040', alpha=0.4) - plt.bar(timestamps, new, label='New', bottom=repeated, color='#ca0020', alpha=0.8, hatch='//') - # test split line - if test_split: - plt.axvline(x=(timestamps[int(0.85 * len(timestamps))]), color="blue", linestyle="--", linewidth=2) - plt.text((timestamps[int(0.85 * len(timestamps))]), 0, - 'x', va='center', ha='center', fontsize=font_size, fontweight='heavy', color='blue') - - plt.margins(x=0) - plt.xlabel("Timestamp", fontsize=font_size) - plt.ylabel("Number of edges", fontsize=font_size) - plt.legend(fontsize = 13) - if filepath is not None: - plt.savefig(f"{filepath}/{network_name}_TEA.pdf") - print("plot saved as " + f"{filepath}/{network_name}_TEA.pdf") - if (show): - plt.show() - - +import pandas as pd +import matplotlib.pyplot as plt +from typing import Union, Optional +from tgx.utils.graph_utils import discretize_edges +from tgx.utils.plotting_utils import create_ts_list +__all__ = ["TEA"] + +def TEA( + temp_edgelist : Union[object, dict], + filepath : Optional[str] = ".", + fig_size : tuple = (7,5), + font_size : int = 20, + network_name : str = None, + time_scale : Union[str, int] = None, + real_dates : bool = None, + test_split : bool = False, + density : bool = False + ): + r""" + generating TEA plot + + Parameters: + temp_edgelist: a dictionary of temporal edges or a dataset object. + filepath: Path to save the TEA Plot. 
+        fig_size: Size of the figure to save.
+        font_size: Size of the text in the figure.
+        network_name: Name of the dataset to be used in the TEA plot file.
+        time_scale: time_scale for discretizing the data if not already done.
+        real_dates: Whether to use the real dates from the dataset.
+        test_split: Whether to show the test split on the plot.
+        density: Whether to return the edge density and edge frequency dictionaries.
+    """
+    if isinstance(temp_edgelist, object):
+        if temp_edgelist.freq_data is None:
+            temp_edgelist.count_freq()
+        temp_edgelist = temp_edgelist.freq_data
+
+    # check number of unique timestamps:
+    unique_ts = list(temp_edgelist.keys())
+    # if len(unique_ts) > max_time_scale:
+    #     inp = input(f"There are {unique_ts} timestamps in the data.\nDo you want to discretize the data to 1000 timestamps?(y/n)").lower()
+    #     if inp == "y":
+    #         temp_edgelist = edgelist_discritizer(temp_edgelist,
+    #                                              unique_ts,
+    #                                              time_scale = max_time_scale)
+    if time_scale is not None:
+        temp_edgelist = discretize_edges(temp_edgelist,
+                                         time_scale = time_scale)
+
+
+    ts_edges_dist, ts_edges_dist_density, edge_frequency_dict = TEA_process_edgelist_per_timestamp(temp_edgelist)
+
+    TEA_plot_edges_bar(ts_edges_dist,
+                       filepath = filepath,
+                       fig_size = fig_size,
+                       font_size = font_size,
+                       network_name=network_name,
+                       real_dates = real_dates,
+                       test_split = test_split)
+
+    if density:
+        return ts_edges_dist_density, edge_frequency_dict
+
+
+
+def TEA_process_edgelist_per_timestamp(temp_edgelist):
+    # generate distribution of the edges history
+    unique_ts = list(temp_edgelist.keys())
+    # unique_ts.sort()
+    # print(f"There are {len(unique_ts)} timestamps.")
+
+    # get node set & total number of nodes
+    node_dict = {}
+    for t, e_dict in temp_edgelist.items():
+        for e, exist in e_dict.items():
+            if e[0] not in node_dict:
+                node_dict[e[0]] = 1
+            if e[1] not in node_dict:
+                node_dict[e[1]] = 1
+    num_nodes = len(node_dict)
+    num_e_fully_connected = num_nodes * (num_nodes - 1)
+
+    edge_frequency_dict = {}  # how many times an edge is seen
+    ts_edges_dist = []  # contains different features specifying the characteristics of the edge distribution over time
+    ts_edges_dist_density = []
+    for curr_t in unique_ts:
+
+        # if curr_t < 2:
+        #     print("curr_t", curr_t)
+        prev_ts = [ts for ts in unique_ts if ts < curr_t]
+        edges_in_prev_ts = {}
+        for bts in prev_ts:
+            edges_in_prev_ts.update(temp_edgelist[bts])
+
+        curr_ts_edge_list = temp_edgelist[curr_t]
+        for e in curr_ts_edge_list:
+            if e not in edge_frequency_dict:
+                edge_frequency_dict[e] = 1
+            else:
+                edge_frequency_dict[e] += 1
+
+        if len(curr_ts_edge_list) > 0:
+            curr_ts_edges_dist = {'ts': curr_t,
+                                  'new': len([e for e in curr_ts_edge_list if e not in edges_in_prev_ts]),
+                                  'repeated': len([e for e in curr_ts_edge_list if e in edges_in_prev_ts]),
+                                  'not_repeated': len([e for e in edges_in_prev_ts if e not in curr_ts_edge_list]),
+                                  'total_curr_ts': len(curr_ts_edge_list),
+                                  'total_seen_until_curr_ts': len(edges_in_prev_ts) + len(curr_ts_edge_list)
+                                  }
+            curr_ts_edges_dist_density = {'ts': curr_t,
+                                          'new': (curr_ts_edges_dist['new'] * 1.0) / num_e_fully_connected,
+                                          'repeated': (curr_ts_edges_dist['repeated'] * 1.0) / num_e_fully_connected,
+                                          'not_repeated': (curr_ts_edges_dist[
+                                              'not_repeated'] * 1.0) / num_e_fully_connected,
+                                          'total_curr_ts': (curr_ts_edges_dist[
+                                              'total_curr_ts'] * 1.0) / num_e_fully_connected,
+                                          'total_seen_until_curr_ts': (curr_ts_edges_dist[
+                                              'total_seen_until_curr_ts'] * 1.0) / num_e_fully_connected,
+                                          }
+        else:
+            curr_ts_edges_dist = {'ts': curr_t,
+                                  'new': 0,
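+                                  # Timestamps with no edges report zero for the
+                                  # per-timestamp counts; 'total_seen_until_curr_ts'
+                                  # below still carries the running total.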
+ 'repeated': 0, + 'not_repeated': 0, + 'total_curr_ts': 0, + 'total_seen_until_curr_ts': len(edges_in_prev_ts) + len(curr_ts_edge_list) + } + curr_ts_edges_dist_density = {'ts': curr_t, + 'new': 0, + 'repeated': 0, + 'not_repeated': 0, + 'total_curr_ts': 0, + 'total_seen_until_curr_ts': 0, + } + ts_edges_dist.append(curr_ts_edges_dist) + ts_edges_dist_density.append(curr_ts_edges_dist_density) + # print(len(edges_in_prev_ts)) + # print(len(ts_edges_dist)) + # print(edge_frequency_dict) + # break + return ts_edges_dist, ts_edges_dist_density, edge_frequency_dict + + +def TEA_plot_edges_bar(ts_edges_dist: list, + filepath: str = ".", + fig_size: list = (9,5), + font_size: int = 20, + network_name: str = None, + real_dates: list = None, + time_scale: list = None, + test_split: bool = False, + show: bool =False): + r""" + Making TEA plot and save into pdf file. + Args: + ts_edges_dist: list of dictionaries containing the edge distribution over time. + filepath: Path to save the TEA Plot. + fig_size: Size of the figure to save. + font_size: Size of the text in the figure. + network_name: Name of the dataset to be used in the TEA plot file. + real_dates: list of real dates as ticks + time_scale: time_scale for discretizing data if already not done. + test_split: Whether show the test split on the plot. + show: Whether to show the plot. + """ + + + ts_edges_dist_df = pd.DataFrame(ts_edges_dist, columns=['ts', 'new', 'repeated', + 'not_repeated', + 'total_curr_ts', + 'total_seen_until_curr_ts']) + + + ### Additional Stats ### + mean = ts_edges_dist_df.mean(axis=0) + # print("INFO: Network Name:", network_name) + # print("INFO: AVG. stats. over all timestamps: ", mean) + # print("INFO: ratio of avg.(new)/avg.(total_curr_ts): {:.2f}".format(mean['new'] / mean['total_curr_ts'])) + ### + + fig, ax = plt.subplots(figsize=fig_size) # lastfm, mooc, reddit, UNtrade, UNvote + plt.subplots_adjust(bottom=0.2, left=0.2) + font_size = font_size + ticks_font_size = 15 + plt.yticks(fontsize=ticks_font_size) + plt.xticks(fontsize=ticks_font_size) + if real_dates is not None: + start = real_dates[0] + end = real_dates[1] + metric = real_dates[2] + create_ts_list(start, end, metric=metric, interval=time_scale) + else: + duration = ts_edges_dist_df['ts'].tolist() + timestamps = [i for i in range(len(duration))] + + new = ts_edges_dist_df['new'].tolist() + repeated = ts_edges_dist_df['repeated'].tolist() + # print(len(timestamps), repeated, new) + # plotting stuffs + # bar plot + plt.bar(timestamps, repeated, label='Repeated', color='#404040', alpha=0.4) + plt.bar(timestamps, new, label='New', bottom=repeated, color='#ca0020', alpha=0.8, hatch='//') + # test split line + if test_split: + plt.axvline(x=(timestamps[int(0.85 * len(timestamps))]), color="blue", linestyle="--", linewidth=2) + plt.text((timestamps[int(0.85 * len(timestamps))]), 0, + 'x', va='center', ha='center', fontsize=font_size, fontweight='heavy', color='blue') + + plt.margins(x=0) + plt.xlabel("Timestamp", fontsize=font_size) + plt.ylabel("Number of edges", fontsize=font_size) + plt.legend(fontsize = 13) + if filepath is not None: + plt.savefig(f"{filepath}/{network_name}_TEA.pdf") + print("plot saved as " + f"{filepath}/{network_name}_TEA.pdf") + if (show): + plt.show() + + diff --git a/tgx/viz/TET.py b/tgx/viz/TET.py index 52bb2a9..ba10c4f 100644 --- a/tgx/viz/TET.py +++ b/tgx/viz/TET.py @@ -1,336 +1,336 @@ -# TET Plot -import numpy as np -import pandas as pd -import seaborn as sns -from tqdm import tqdm -from typing import Union, Optional -import 
matplotlib.pyplot as plt -from tgx.utils.graph_utils import discretize_edges - - -# some parameters to be used for drawing -E_ABSENT = 0 -E_PRESENCE_GENERAL = 1 -E_SEEN_IN_TRAIN = 2 -E_IN_TEST = 3 -E_NOT_IN_TEST = 4 - -TEST_RATIO = 0.15 - -# new color controlling parameters; Date: Dec. 22, 2021 -E_ONLY_TRAIN = 10 -E_TRAIN_AND_TEST = 20 -E_TRANSDUCTIVE = 30 -E_INDUCTIVE = 40 - - -#! should be merged graph class? -def TET(temp_edgelist : Union[object, dict], - filepath: Optional[str] = ".", - time_scale : Union[str, int] = None, - network_name : str = None, - add_frame : bool = True, - test_split : bool = False, - figsize : tuple = (9, 5), - axis_title_font_size : int = 20, - ticks_font_size : int = 20, - show: bool = True): - r""" - Generate TET plots - Args: - temp_edgelist: a dictionary of temporal edges or a dataset object. - filepath: Path to save the TEA Plot. - figsize: Size of the figure to save. - axis_title_font_size: The font size of xis titles. - ticks_font_size: Size of the text in the figure. - add_frame: Add the frame to the plot. - network_name: Name of the dataset to be used in the TEA plot file. - time_scale: time_scale for discretizing data if already not done. - test_split: Whether show the test split on the plot. - max_time_scale: Maximum number of time_scale to discretize data. - show: Whether to show the plot. - """ - if isinstance(temp_edgelist, object): - if temp_edgelist.freq_data is None: - temp_edgelist.count_freq() - temp_edgelist = temp_edgelist.freq_data - - # check number of unique timestamps: - unique_ts = list(temp_edgelist.keys()) - # if len(unique_ts) > max_time_scale: - # inp = input(f"There are {unique_ts} timestamps in the data.\nDo you want to discretize the data to 1000 timestamps?(y/n)").lower() - # if inp == "y": - # temp_edgelist = edgelist_discritizer(temp_edgelist, - # unique_ts, - # time_scale = max_time_scale) - if time_scale is not None: - temp_edgelist = discretize_edges(temp_edgelist, - time_scale = time_scale) - - edge_last_ts = generate_edge_last_timestamp(temp_edgelist) - edge_idx_map = generate_edge_idx_map(temp_edgelist, edge_last_ts) - idx_edge_map = {v: k for k, v in edge_idx_map.items()} # key: edge index; value: actual edge (source, destination) - print("Info: Number of distinct edges (from index-edge map): {}".format(len(idx_edge_map))) - - unique_ts_list = list(temp_edgelist.keys()) - e_presence_mat = generate_edge_presence_matrix(unique_ts_list, idx_edge_map, edge_idx_map, temp_edgelist) - print("Info: edge-presence-matrix shape: {}".format(e_presence_mat.shape)) - # print(np.unique(e_presence_mat, return_counts=True)) - e_presence_mat, test_split_ts_value = process_presence_matrix(e_presence_mat, test_ratio_p=0.85) - print("Info: edge-presence-matrix shape: {}".format(e_presence_mat.shape)) - # print(np.unique(e_presence_mat, return_counts=True)) - fig_param = set_fig_param(network_name, - fig_name = filepath, - figsize = figsize, - axis_title_font_size = axis_title_font_size, - ticks_font_size = ticks_font_size) - - plot_edge_presence_matrix(e_presence_mat, test_split_ts_value, unique_ts_list, list(idx_edge_map.keys()), - fig_param, test_split = test_split, add_frames=add_frame, show=show) - return - - -def generate_edge_last_timestamp(edges_per_ts): - """generates a dictionary containing the last timestamp of each edge""" - edge_last_ts = {} - for ts, e_list in edges_per_ts.items(): - for e in e_list: - if e not in edge_last_ts: - edge_last_ts[e] = ts - else: - edge_last_ts[e] = max(ts, edge_last_ts[e]) - return edge_last_ts - 
- -def generate_edge_idx_map(edges_per_ts, edge_last_ts): - """ - generates index for edges according to two-level sorting policy: - 1. the first level is based on their first appearance timestamp - 2. the second level is based on their last appearance timestamp - """ - edge_idx_map = {} # key: actual edge (source, destination), value: edge index - distinct_edge_idx = 0 - for ts, ts_e_list in edges_per_ts.items(): - e_last_ts_this_timestamp = {} - for e in ts_e_list: - e_last_ts_this_timestamp[e] = edge_last_ts[e] - e_last_ts_this_timestamp = dict(sorted(e_last_ts_this_timestamp.items(), key=lambda item: item[1])) - for e in e_last_ts_this_timestamp: - if e not in edge_idx_map: - edge_idx_map[e] = distinct_edge_idx - distinct_edge_idx += 1 - - return edge_idx_map - - -def generate_edge_presence_matrix(unique_ts_list, idx_edge_map, edge_idx_map, edges_per_ts): - ''' - Returns presence matrix with values 0 and 1 which indicate: - value = 0 : edge is not present in this timestamp - value = 1 : edge is present in this timestamp - - shape: (ts, total number of edges) - ''' - num_unique_ts = len(unique_ts_list) - num_unique_edge = len(idx_edge_map) - e_presence_mat = np.zeros([num_unique_ts, num_unique_edge], dtype=np.int8) - unique_ts_list = np.sort(unique_ts_list) - - for x, ts in tqdm(enumerate(unique_ts_list)): - es_ts = edges_per_ts[ts] - for e in es_ts: - e_presence_mat[num_unique_ts - x - 1, edge_idx_map[e]] = E_PRESENCE_GENERAL - - return e_presence_mat - -def process_presence_matrix(e_presence_matrix, test_ratio_p): - """ - there are 4 types of edge presence: - 1. only in train - 2. in train and in test - 3. in test and train (which is the number 2 but in later timestamps) - 4. only in test - X: timestamp - Y: edge index - """ - num_unique_ts = e_presence_matrix.shape[0] - num_unique_edges = e_presence_matrix.shape[1] - ts_idx_list = [i for i in range(num_unique_ts)] - - # generating timestamp list for train and test: - test_split_ts_value = int(np.quantile(ts_idx_list, test_ratio_p)) - train_ts_list = [ts for ts in ts_idx_list if ts <= test_split_ts_value] # any timestamp in train/validation split - test_ts_list = [ts for ts in ts_idx_list if ts > test_split_ts_value] # test_split_ts_value is in train - - # first level processing: differentiate train set edges: 1) Only in train set, 2) in train & test set - print("First level processing: ") - print("Detecting edges present in train & test sets") - for tr_ts in tqdm(train_ts_list): - for eidx in range(num_unique_edges): - if e_presence_matrix[num_unique_ts - tr_ts - 1, eidx] == E_PRESENCE_GENERAL: - for test_ts_idx in range(test_split_ts_value + 1, num_unique_ts): - if e_presence_matrix[num_unique_ts - test_ts_idx - 1, eidx] == E_PRESENCE_GENERAL: # if seen in - # the test set - e_presence_matrix[num_unique_ts - tr_ts - 1, eidx] = E_TRAIN_AND_TEST - break - - # differentiate test set edges: 1) transductive (seen in train, repeating in test), 2) inductive (only in test) - print("Detecting transductive edges (seen in train, repeating in test)") - for ts in tqdm(test_ts_list): - for eidx in range(num_unique_edges): - if e_presence_matrix[num_unique_ts - ts - 1, eidx] == E_PRESENCE_GENERAL: - for prev_ts_idx in range(test_split_ts_value, -1, -1): - if e_presence_matrix[num_unique_ts - prev_ts_idx - 1, eidx] == E_TRAIN_AND_TEST: # if seen in - # the training set - e_presence_matrix[num_unique_ts - ts - 1, eidx] = E_TRANSDUCTIVE - break - - # second level processing - print("Second level processing:") - print("Detecting edges 1) Only in train 
set, 2) only in test (inductive)") - for ts in tqdm(range(num_unique_ts)): - for eidx in range(num_unique_edges): - if ts <= test_split_ts_value: - if e_presence_matrix[num_unique_ts - ts - 1, eidx] == E_PRESENCE_GENERAL: - e_presence_matrix[num_unique_ts - ts - 1, eidx] = E_ONLY_TRAIN - else: - if e_presence_matrix[num_unique_ts - ts - 1, eidx] == E_PRESENCE_GENERAL: - e_presence_matrix[num_unique_ts - ts - 1, eidx] = E_INDUCTIVE - - return e_presence_matrix, test_split_ts_value - - -def plot_edge_presence_matrix(e_presence_mat, - test_split_ts_value, - unique_ts_list, - idx_edge_list, - fig_param, - test_split = False, - add_frames=True, - show=False): - print("Info: plotting edge presence heatmap for {} ...".format(fig_param.fig_name)) - - fig, ax = plt.subplots(figsize=fig_param.figsize) - plt.subplots_adjust(bottom=0.3, left=0.2) - - # colors = ['white', # E_ABSENCE - # '#67a9cf', # E_ONLY_TRAIN - # '#ef8a62', # E_TRAIN_AND_TEST - # '#ef8a62', # E_TRANSDUCTIVE - # '#b2182b' # E_INDUCTIVE - # ] - if test_split: - colors = ['white', # E_ABSENCE - '#018571', # E_ONLY_TRAIN 2c7bb6 - '#fc8d59', # E_TRAIN_AND_TEST - '#fc8d59', # E_TRANSDUCTIVE - '#b2182b' # E_INDUCTIVE - ] - else: - colors = ['white', - '#ca0020', - '#ca0020', - '#ca0020', - '#ca0020',] - # print(sns.color_palette(colors, as_cmap=True)) - frame_color = "grey" # "#bababa" - time_split_color = "black" - axis_title_font_size = fig_param.axis_title_font_size - x_font_size = fig_param.ticks_font_size - y_font_size = fig_param.ticks_font_size - - ax = sns.heatmap(e_presence_mat, cmap=sns.color_palette(colors, as_cmap=True), cbar=False) - - # processing x-axis - x_gaps = np.linspace(0, len((idx_edge_list)), num=5) - x_labels = x_gaps / len(idx_edge_list) - x_labels = [int(100*x) for x in x_labels] - plt.xticks(x_gaps, x_labels, rotation=0, fontsize=x_font_size) - - # processing y-axis - t_gaps = np.linspace(0, len(unique_ts_list), num=5) - t_labels = [int(len(unique_ts_list) - tidx) for tidx in t_gaps] - plt.yticks(t_gaps, t_labels, rotation=90, fontsize=y_font_size) - - # axis & title - # plt.margins(x=0) - plt.xlabel("Percentage of observed edges", fontsize=axis_title_font_size) - plt.ylabel("Timestamp", fontsize=axis_title_font_size) - - # requirements for additional features - x_length = e_presence_mat.shape[1] - 1 - y_length = e_presence_mat.shape[0] - 1 - test_split_idx_value = y_length - test_split_ts_value - e_border_idx = 0 - for e_idx in range(e_presence_mat.shape[1] - 1, -1, -1): - if e_presence_mat[y_length - test_split_ts_value, e_idx] != E_ABSENT: - e_border_idx = e_idx - break - - # rectangle for different parts of the dataset - if add_frames and test_split: - print("Info: Border edge index:", e_border_idx) - print("Info: Test split timestamp value:", test_split_ts_value) - rect_train = plt.Rectangle((0, y_length - test_split_ts_value + 0.085), e_border_idx, test_split_ts_value + 0.9, - fill=False, linewidth=2, edgecolor=frame_color) - rect_test_mayseen = plt.Rectangle((0, 0), e_border_idx, y_length - test_split_ts_value - 0.1, - fill=False, linewidth=2, edgecolor=frame_color) - rect_test_new = plt.Rectangle((e_border_idx, 0), x_length - e_border_idx, - y_length - test_split_ts_value - 0.1, - fill=False, linewidth=2, edgecolor=frame_color) - ax = ax or plt.gca() - ax.add_patch(rect_train) - ax.add_patch(rect_test_mayseen) - ax.add_patch(rect_test_new) - - elif add_frames: - ax.add_patch(plt.Rectangle((0, 0), x_length, y_length+1, - fill=False, linewidth=2, edgecolor=frame_color)) - # test split horizontal line - if 
test_split:
-        plt.axhline(y=test_split_idx_value, color=time_split_color, linestyle="--", linewidth=2, label='x')
-        plt.text(x=0, y=test_split_idx_value, s='x', color=time_split_color, va='center', ha='center',
-                 fontsize=y_font_size, fontweight='heavy')
-
-    if fig_param.fig_name is not None:
-        # print("Info: file name: {}".format(fig_param.fig_name))
-        plt.savefig(f"{fig_param.fig_name}/{fig_param.network_name}_TET.pdf")
-    plt.show()
-    print("Info: plotting done!")
-
-def set_fig_param(network_name, fig_name = None,
-                  figsize = (9, 5),
-                  axis_title_font_size = 20,
-                  ticks_font_size = 22,
-                  axis_tick_gap = 20,
-                  timestamp_split_cross_mark_offset = 1):
-
-    # if network_name in ['US Legislative', 'Canadian Vote', 'UN Trade', 'UN Vote']:
-    #     axis_tick_gap = axis_tick_gap * 0.35
-
-    # elif network_name in ['Reddit', 'Wikipedia', 'UCI', 'Social Evo.', 'Flights', 'LastFM', 'MOOC']:
-    #     axis_tick_gap = axis_tick_gap * 0.5
-
-    # elif network_name in ['Enron']:
-    #     axis_tick_gap = axis_tick_gap * 0.4
-
-    fig_param = Fig_Param(network_name,
-                          fig_name,
-                          figsize,
-                          axis_title_font_size,
-                          ticks_font_size,
-                          axis_tick_gap,
-                          timestamp_split_cross_mark_offset)
-
-    return fig_param
-
-class Fig_Param:
-    def __init__(self, network_name, fig_name, figsize, axis_title_font_size, ticks_font_size, axis_tick_gap,
-                 timestamp_split_cross_mark_offset):
-        self.network_name = network_name
-        self.fig_name = fig_name
-        self.figsize = figsize
-        self.axis_title_font_size = axis_title_font_size
-        self.ticks_font_size = ticks_font_size
-        self.axis_tick_gap = axis_tick_gap
+# TET Plot
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from tqdm import tqdm
+from typing import Union, Optional
+import matplotlib.pyplot as plt
+from tgx.utils.graph_utils import discretize_edges
+
+
+# some parameters to be used for drawing
+E_ABSENT = 0
+E_PRESENCE_GENERAL = 1
+E_SEEN_IN_TRAIN = 2
+E_IN_TEST = 3
+E_NOT_IN_TEST = 4
+
+TEST_RATIO = 0.15
+
+# new color controlling parameters; Date: Dec. 22, 2021
+E_ONLY_TRAIN = 10
+E_TRAIN_AND_TEST = 20
+E_TRANSDUCTIVE = 30
+E_INDUCTIVE = 40
+
+
+#! should this be merged into the Graph class?
+def TET(temp_edgelist : Union[object, dict],
+        filepath: Optional[str] = ".",
+        time_scale : Union[str, int] = None,
+        network_name : str = None,
+        add_frame : bool = True,
+        test_split : bool = False,
+        figsize : tuple = (9, 5),
+        axis_title_font_size : int = 20,
+        ticks_font_size : int = 20,
+        show: bool = True):
+    r"""
+    Generate TET plots
+    Args:
+        temp_edgelist: a dictionary of temporal edges or a dataset object.
+        filepath: Path to save the TET plot.
+        figsize: Size of the figure to save.
+        axis_title_font_size: The font size of axis titles.
+        ticks_font_size: Size of the text in the figure.
+        add_frame: Add the frame to the plot.
+        network_name: Name of the dataset to be used in the TET plot file name.
+        time_scale: time scale for discretizing the data if not already done.
+        test_split: Whether to show the test split on the plot.
+        show: Whether to show the plot.
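+    Example (illustrative; mirrors the calls in examples/data_viz.py and assumes the built-in UCI dataset is available):
+        >>> import tgx
+        >>> dataset = tgx.builtin.uci()
+        >>> dtdg = tgx.Graph(dataset).discretize(time_scale="weekly")[0]
+        >>> tgx.TET(dtdg, network_name=dataset.name, figsize=(9, 5))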
+ """ + if isinstance(temp_edgelist, object): + if temp_edgelist.freq_data is None: + temp_edgelist.count_freq() + temp_edgelist = temp_edgelist.freq_data + + # check number of unique timestamps: + unique_ts = list(temp_edgelist.keys()) + # if len(unique_ts) > max_time_scale: + # inp = input(f"There are {unique_ts} timestamps in the data.\nDo you want to discretize the data to 1000 timestamps?(y/n)").lower() + # if inp == "y": + # temp_edgelist = edgelist_discritizer(temp_edgelist, + # unique_ts, + # time_scale = max_time_scale) + if time_scale is not None: + temp_edgelist = discretize_edges(temp_edgelist, + time_scale = time_scale) + + edge_last_ts = generate_edge_last_timestamp(temp_edgelist) + edge_idx_map = generate_edge_idx_map(temp_edgelist, edge_last_ts) + idx_edge_map = {v: k for k, v in edge_idx_map.items()} # key: edge index; value: actual edge (source, destination) + print("Info: Number of distinct edges (from index-edge map): {}".format(len(idx_edge_map))) + + unique_ts_list = list(temp_edgelist.keys()) + e_presence_mat = generate_edge_presence_matrix(unique_ts_list, idx_edge_map, edge_idx_map, temp_edgelist) + print("Info: edge-presence-matrix shape: {}".format(e_presence_mat.shape)) + # print(np.unique(e_presence_mat, return_counts=True)) + e_presence_mat, test_split_ts_value = process_presence_matrix(e_presence_mat, test_ratio_p=0.85) + print("Info: edge-presence-matrix shape: {}".format(e_presence_mat.shape)) + # print(np.unique(e_presence_mat, return_counts=True)) + fig_param = set_fig_param(network_name, + fig_name = filepath, + figsize = figsize, + axis_title_font_size = axis_title_font_size, + ticks_font_size = ticks_font_size) + + plot_edge_presence_matrix(e_presence_mat, test_split_ts_value, unique_ts_list, list(idx_edge_map.keys()), + fig_param, test_split = test_split, add_frames=add_frame, show=show) + return + + +def generate_edge_last_timestamp(edges_per_ts): + """generates a dictionary containing the last timestamp of each edge""" + edge_last_ts = {} + for ts, e_list in edges_per_ts.items(): + for e in e_list: + if e not in edge_last_ts: + edge_last_ts[e] = ts + else: + edge_last_ts[e] = max(ts, edge_last_ts[e]) + return edge_last_ts + + +def generate_edge_idx_map(edges_per_ts, edge_last_ts): + """ + generates index for edges according to two-level sorting policy: + 1. the first level is based on their first appearance timestamp + 2. 
the second level is based on their last appearance timestamp + """ + edge_idx_map = {} # key: actual edge (source, destination), value: edge index + distinct_edge_idx = 0 + for ts, ts_e_list in edges_per_ts.items(): + e_last_ts_this_timestamp = {} + for e in ts_e_list: + e_last_ts_this_timestamp[e] = edge_last_ts[e] + e_last_ts_this_timestamp = dict(sorted(e_last_ts_this_timestamp.items(), key=lambda item: item[1])) + for e in e_last_ts_this_timestamp: + if e not in edge_idx_map: + edge_idx_map[e] = distinct_edge_idx + distinct_edge_idx += 1 + + return edge_idx_map + + +def generate_edge_presence_matrix(unique_ts_list, idx_edge_map, edge_idx_map, edges_per_ts): + ''' + Returns presence matrix with values 0 and 1 which indicate: + value = 0 : edge is not present in this timestamp + value = 1 : edge is present in this timestamp + + shape: (ts, total number of edges) + ''' + num_unique_ts = len(unique_ts_list) + num_unique_edge = len(idx_edge_map) + e_presence_mat = np.zeros([num_unique_ts, num_unique_edge], dtype=np.int8) + unique_ts_list = np.sort(unique_ts_list) + + for x, ts in tqdm(enumerate(unique_ts_list)): + es_ts = edges_per_ts[ts] + for e in es_ts: + e_presence_mat[num_unique_ts - x - 1, edge_idx_map[e]] = E_PRESENCE_GENERAL + + return e_presence_mat + +def process_presence_matrix(e_presence_matrix, test_ratio_p): + """ + there are 4 types of edge presence: + 1. only in train + 2. in train and in test + 3. in test and train (which is the number 2 but in later timestamps) + 4. only in test + X: timestamp + Y: edge index + """ + num_unique_ts = e_presence_matrix.shape[0] + num_unique_edges = e_presence_matrix.shape[1] + ts_idx_list = [i for i in range(num_unique_ts)] + + # generating timestamp list for train and test: + test_split_ts_value = int(np.quantile(ts_idx_list, test_ratio_p)) + train_ts_list = [ts for ts in ts_idx_list if ts <= test_split_ts_value] # any timestamp in train/validation split + test_ts_list = [ts for ts in ts_idx_list if ts > test_split_ts_value] # test_split_ts_value is in train + + # first level processing: differentiate train set edges: 1) Only in train set, 2) in train & test set + print("First level processing: ") + print("Detecting edges present in train & test sets") + for tr_ts in tqdm(train_ts_list): + for eidx in range(num_unique_edges): + if e_presence_matrix[num_unique_ts - tr_ts - 1, eidx] == E_PRESENCE_GENERAL: + for test_ts_idx in range(test_split_ts_value + 1, num_unique_ts): + if e_presence_matrix[num_unique_ts - test_ts_idx - 1, eidx] == E_PRESENCE_GENERAL: # if seen in + # the test set + e_presence_matrix[num_unique_ts - tr_ts - 1, eidx] = E_TRAIN_AND_TEST + break + + # differentiate test set edges: 1) transductive (seen in train, repeating in test), 2) inductive (only in test) + print("Detecting transductive edges (seen in train, repeating in test)") + for ts in tqdm(test_ts_list): + for eidx in range(num_unique_edges): + if e_presence_matrix[num_unique_ts - ts - 1, eidx] == E_PRESENCE_GENERAL: + for prev_ts_idx in range(test_split_ts_value, -1, -1): + if e_presence_matrix[num_unique_ts - prev_ts_idx - 1, eidx] == E_TRAIN_AND_TEST: # if seen in + # the training set + e_presence_matrix[num_unique_ts - ts - 1, eidx] = E_TRANSDUCTIVE + break + + # second level processing + print("Second level processing:") + print("Detecting edges 1) Only in train set, 2) only in test (inductive)") + for ts in tqdm(range(num_unique_ts)): + for eidx in range(num_unique_edges): + if ts <= test_split_ts_value: + if e_presence_matrix[num_unique_ts - ts - 1, eidx] == 
E_PRESENCE_GENERAL: + e_presence_matrix[num_unique_ts - ts - 1, eidx] = E_ONLY_TRAIN + else: + if e_presence_matrix[num_unique_ts - ts - 1, eidx] == E_PRESENCE_GENERAL: + e_presence_matrix[num_unique_ts - ts - 1, eidx] = E_INDUCTIVE + + return e_presence_matrix, test_split_ts_value + + +def plot_edge_presence_matrix(e_presence_mat, + test_split_ts_value, + unique_ts_list, + idx_edge_list, + fig_param, + test_split = False, + add_frames=True, + show=False): + print("Info: plotting edge presence heatmap for {} ...".format(fig_param.fig_name)) + + fig, ax = plt.subplots(figsize=fig_param.figsize) + plt.subplots_adjust(bottom=0.3, left=0.2) + + # colors = ['white', # E_ABSENCE + # '#67a9cf', # E_ONLY_TRAIN + # '#ef8a62', # E_TRAIN_AND_TEST + # '#ef8a62', # E_TRANSDUCTIVE + # '#b2182b' # E_INDUCTIVE + # ] + if test_split: + colors = ['white', # E_ABSENCE + '#018571', # E_ONLY_TRAIN 2c7bb6 + '#fc8d59', # E_TRAIN_AND_TEST + '#fc8d59', # E_TRANSDUCTIVE + '#b2182b' # E_INDUCTIVE + ] + else: + colors = ['white', + '#ca0020', + '#ca0020', + '#ca0020', + '#ca0020',] + # print(sns.color_palette(colors, as_cmap=True)) + frame_color = "grey" # "#bababa" + time_split_color = "black" + axis_title_font_size = fig_param.axis_title_font_size + x_font_size = fig_param.ticks_font_size + y_font_size = fig_param.ticks_font_size + + ax = sns.heatmap(e_presence_mat, cmap=sns.color_palette(colors, as_cmap=True), cbar=False) + + # processing x-axis + x_gaps = np.linspace(0, len((idx_edge_list)), num=5) + x_labels = x_gaps / len(idx_edge_list) + x_labels = [int(100*x) for x in x_labels] + plt.xticks(x_gaps, x_labels, rotation=0, fontsize=x_font_size) + + # processing y-axis + t_gaps = np.linspace(0, len(unique_ts_list), num=5) + t_labels = [int(len(unique_ts_list) - tidx) for tidx in t_gaps] + plt.yticks(t_gaps, t_labels, rotation=90, fontsize=y_font_size) + + # axis & title + # plt.margins(x=0) + plt.xlabel("Percentage of observed edges", fontsize=axis_title_font_size) + plt.ylabel("Timestamp", fontsize=axis_title_font_size) + + # requirements for additional features + x_length = e_presence_mat.shape[1] - 1 + y_length = e_presence_mat.shape[0] - 1 + test_split_idx_value = y_length - test_split_ts_value + e_border_idx = 0 + for e_idx in range(e_presence_mat.shape[1] - 1, -1, -1): + if e_presence_mat[y_length - test_split_ts_value, e_idx] != E_ABSENT: + e_border_idx = e_idx + break + + # rectangle for different parts of the dataset + if add_frames and test_split: + print("Info: Border edge index:", e_border_idx) + print("Info: Test split timestamp value:", test_split_ts_value) + rect_train = plt.Rectangle((0, y_length - test_split_ts_value + 0.085), e_border_idx, test_split_ts_value + 0.9, + fill=False, linewidth=2, edgecolor=frame_color) + rect_test_mayseen = plt.Rectangle((0, 0), e_border_idx, y_length - test_split_ts_value - 0.1, + fill=False, linewidth=2, edgecolor=frame_color) + rect_test_new = plt.Rectangle((e_border_idx, 0), x_length - e_border_idx, + y_length - test_split_ts_value - 0.1, + fill=False, linewidth=2, edgecolor=frame_color) + ax = ax or plt.gca() + ax.add_patch(rect_train) + ax.add_patch(rect_test_mayseen) + ax.add_patch(rect_test_new) + + elif add_frames: + ax.add_patch(plt.Rectangle((0, 0), x_length, y_length+1, + fill=False, linewidth=2, edgecolor=frame_color)) + # test split horizontal line + if test_split: + plt.axhline(y=test_split_idx_value, color=time_split_color, linestyle="--", linewidth=2, label='x') + plt.text(x=0, y=test_split_idx_value, s='x', color=time_split_color, va='center', 
ha='center', + fontsize=y_font_size, fontweight='heavy') + + if fig_param.fig_name is not None: + # print("Info: file name: {}".format(fig_param.fig_name)) + plt.savefig(f"{fig_param.fig_name}/{fig_param.network_name}_TET.pdf") + plt.show() + print("Info: plotting done!") + +def set_fig_param(network_name, fig_name = None, + figsize = (9, 5), + axis_title_font_size = 20, + ticks_font_size = 22, + axis_tick_gap = 20, + timestamp_split_cross_mark_offset = 1): + + # if network_name in ['US Legislative', 'Canadian Vote', 'UN Trade', 'UN Vote']: + # axis_tick_gap = axis_tick_gap * 0.35 + + # elif network_name in ['Reddit', 'Wikipedia', 'UCI', 'Social Evo.', 'Flights', 'LastFM', 'MOOC']: + # axis_tick_gap = axis_tick_gap * 0.5 + + # elif network_name in ['Enron']: + # axis_tick_gap = axis_tick_gap * 0.4 + + fig_param = Fig_Param(network_name, + fig_name, + figsize, + axis_title_font_size, + ticks_font_size, + axis_tick_gap, + timestamp_split_cross_mark_offset) + + return fig_param + +class Fig_Param: + def __init__(self, network_name, fig_name, figsize, axis_title_font_size, ticks_font_size, axis_tick_gap, + timestamp_split_cross_mark_offset): + self.network_name = network_name + self.fig_name = fig_name + self.figsize = figsize + self.axis_title_font_size = axis_title_font_size + self.ticks_font_size = ticks_font_size + self.axis_tick_gap = axis_tick_gap self.timestamp_split_cross_mark_offset = timestamp_split_cross_mark_offset \ No newline at end of file From 38c96ef5d1d7410de5c81a4410c789a16d8a283e Mon Sep 17 00:00:00 2001 From: shenyangHuang Date: Sun, 21 Jan 2024 12:04:08 -0500 Subject: [PATCH 6/7] updating example --- examples/data_viz.py | 51 +++++++++-------- tgx/utils/stat.py | 130 +++++++++++++++++++++++-------------------- 2 files changed, 96 insertions(+), 85 deletions(-) diff --git a/examples/data_viz.py b/examples/data_viz.py index d74abe7..c4045d6 100644 --- a/examples/data_viz.py +++ b/examples/data_viz.py @@ -1,40 +1,38 @@ import tgx -from tgx.utils.graph_utils import list2csv +from tgx.utils.plotting_utils import plot_for_snapshots """ -1. load a dataset -2. load into a graph -3. discretize the graph -4. save the graph back to a csv +master example to show all visualization in tgx """ -#! load the datasets -# dataset = tgx.builtin.uci() #built in datasets +#* load built in datasets +dataset = tgx.builtin.uci() #built in datasets -data_name = "tgbl-wiki" #"tgbl-review" -dataset = tgx.tgb_data(data_name) #tgb datasets +#* load the tgb datasets +# data_name = "tgbl-wiki" #"tgbl-review" +# dataset = tgx.tgb_data(data_name) #tgb datasets ctdg = tgx.Graph(dataset) -time_scale = "daily" +time_scale = "weekly" #"daily" dtdg = ctdg.discretize(time_scale=time_scale)[0] #! 
plotting the statistics, works -tgx.degree_over_time(dtdg, network_name=data_name) -tgx.nodes_over_time(dtdg, network_name=data_name) -tgx.edges_over_time(dtdg, network_name=data_name) -tgx.nodes_and_edges_over_time(dtdg, network_name=data_name) +tgx.degree_over_time(dtdg, network_name=dataset.name) +tgx.nodes_over_time(dtdg, network_name=dataset.name) +tgx.edges_over_time(dtdg, network_name=dataset.name) +tgx.nodes_and_edges_over_time(dtdg, network_name=dataset.name) tgx.TET(dtdg, - network_name=data_name, + network_name=dataset.name, figsize = (9, 5), axis_title_font_size = 24, ticks_font_size = 24) tgx.TEA(dtdg, - network_name=data_name) + network_name=dataset.name) @@ -42,18 +40,23 @@ test_ratio = 0.15 tgx.get_reoccurrence(ctdg, test_ratio=test_ratio) tgx.get_surprise(ctdg, test_ratio=test_ratio) - -#* these two much faster on dtdgs tgx.get_avg_node_activity(dtdg) tgx.get_novelty(dtdg) +# Number of Connected Components +tgx.connected_components_per_ts(dtdg, network_name=dataset.name, plot_path = plot_path) +# Size of Largest Connected Component +component_sizes = tgx.size_connected_components(dtdg) +largest_component_sizes = [max(inner_list) if inner_list else 0 for inner_list in component_sizes] +filename = f"{dataset.name}_largest_connected_component_size" +plot_for_snapshots(largest_component_sizes, filename, "Size of Largest Connected Component", plot_path = plot_path) +# Average Node Engagement +engagements = tgx.get_avg_node_engagement(dtdg) +filename = f"{dataset.name}_average_node_engagement" +plot_for_snapshots(engagements, filename, "Average Engagement", plot_path = plot_path) -# #! statistics to be updated and fixed -# #TODO -# tgx.degree_density() -# tgx.connected_components_per_ts() -# tgx.size_connected_components() -# tgx.get_avg_node_engagement() \ No newline at end of file +# Degree Density +tgx.degree_density(dtdg, k=3, network_name=dataset.name, plot_path = plot_path) \ No newline at end of file diff --git a/tgx/utils/stat.py b/tgx/utils/stat.py index a03783a..ac97cd6 100644 --- a/tgx/utils/stat.py +++ b/tgx/utils/stat.py @@ -1,7 +1,6 @@ from tgx.utils.plotting_utils import plot_for_snapshots, plot_nodes_edges_per_ts, plot_density_map import networkx as nx import numpy as np -from tgx.utils.graph_utils import train_test_split from typing import List, Dict __all__ = ["degree_over_time", @@ -22,6 +21,21 @@ "size_connected_components", "get_avg_node_engagement"] +#* helper functions +def _find(x, parent): + if parent[x] == x: + return x + parent[x] = _find(parent[x], parent) + return parent[x] + + +def _merge(x, y, parent): + root_x = _find(x, parent) + root_y = _find(y, parent) + + if root_x != root_y: + parent[root_x] = root_y + def degree_over_time(graph: object, network_name: str, @@ -343,28 +357,29 @@ def get_avg_node_activity(graph: object) -> float: return avg_node_activity -#* new graph stats added -#TODO to not require k as input but get it from the Graph object -def degree_density(graph : object, k: int = 10, network_name: str = None, plot_path: str = None) -> None: + +def degree_density(graph: tuple, k: int = 10, network_name: str = None, plot_path: str = None) -> None: r""" Plot density map of node degrees per time window Parameters: - graph: Graph object created by tgx.Graph containing edgelist + graph_edgelist: Dictionary containing graph data k: number of time windows network_name: name of the graph to be used in the output file name plot_path: path to save the output figure """ graph_edgelist = graph.data - degrees_by_k_list = [] temp = [] temp_idx = 0 
unique_ts = list(graph_edgelist.keys()) + for ts in unique_ts: e_at_this_ts = graph_edgelist[ts] G = nx.MultiGraph() - for e, repeat in e_at_this_ts.items(): - G.add_edge(e[0], e[1], weight=repeat) + + for e in e_at_this_ts: + G.add_edge(e[0], e[1]) + nodes = G.nodes() degrees = [G.degree[n] for n in nodes] @@ -375,34 +390,20 @@ def degree_density(graph : object, k: int = 10, network_name: str = None, plot_p degrees_by_k_list.append(temp) temp = degrees temp_idx = 1 + if temp: degrees_by_k_list.append(temp) if network_name is not None: - filename = f"{network_name}_get_degree_density" + filename = f"{network_name}_degree_density" else: - filename = "_get_degree_density" + filename = "_degree_density" + plot_density_map(degrees_by_k_list, filename, "Node Degree", plot_path = plot_path) - print("Plotting Done!") return -def _find(x, parent): - if parent[x] == x: - return x - parent[x] = _find(parent[x], parent) - return parent[x] - - -def _merge(x, y, parent): - root_x = _find(x, parent) - root_y = _find(y, parent) - - if root_x != root_y: - parent[root_x] = root_y - -#TODO to be fixed -def connected_components_per_ts(graph: list, +def connected_components_per_ts(graph: tuple, network_name: str = None, plot_path: str = None) -> None: r""" @@ -413,57 +414,60 @@ def connected_components_per_ts(graph: list, plot_path: path to save the output figure """ num_components = [] - for t in range(len(graph)): - parent = list(range(graph[t].number_of_nodes)) + for t in range(len(graph.data)): + edgelist_t = graph.data[t] + nodes_t = graph.edgelist_node_list(edgelist_t) + parent = {node: node for node in nodes_t} - for _, edge_data in graph[t].edgelist.items(): - for (u, v), _ in edge_data.items(): - _merge(u, v, parent) + for edge in edgelist_t: + (u, v) = edge + _merge(u, v, parent) num = 0 - for u in graph[t].nodes(): + for u in nodes_t: if parent[u] == u: - num += 1 - num_components.append(num) + num += 1 + num_components.append(num) if network_name is not None: filename = f"{network_name}_connected_components_per_ts" else: filename = "_connected_components_per_ts" - plot_for_snapshots(num_components, filename, "Number of connected components", plot_path = plot_path) - print(num_components) - print("Plotting Done!") + plot_for_snapshots(num_components, filename, "Number of connected components", plot_path = plot_path) return -#TODO to be fixed -def size_connected_components(graph: list) -> List[Dict]: + +def size_connected_components(graph: tuple) -> List[List]: r""" Calculate the sizes of connected components per timestamp Returns: - list: A list containing the sizes of connected components in each timestamp. + list[list]: A list containing lists of sizes of connected components for each timestamp. 
""" component_sizes = [] - for t in range(len(graph)): - parent = list(range(graph[t].number_of_nodes)) + for t in range(len(graph.data)): + edgelist_t = graph.data[t] + nodes_t = graph.edgelist_node_list(edgelist_t) + parent = {node: node for node in nodes_t} - for _, edge_data in graph[t].edgelist.items(): - for (u, v), _ in edge_data.items(): - _merge(u, v, parent) + for edge in edgelist_t: + (u, v) = edge + _merge(u, v, parent) component_sizes_t = {} - for u in graph[t].nodes(): + for u in nodes_t: root = _find(u, parent) if root not in component_sizes_t: component_sizes_t[root] = 0 component_sizes_t[root] += 1 - - component_sizes.append(component_sizes_t) + + component_sizes_t_list = list(component_sizes_t.values()) + component_sizes.append(component_sizes_t_list) return component_sizes -def get_avg_node_engagement(graph_edgelist: dict) -> List[int]: +def get_avg_node_engagement(graph: tuple) -> List[int]: r""" Calculate the average node engagement per timestamp, the average number of distinct nodes that establish @@ -473,19 +477,23 @@ def get_avg_node_engagement(graph_edgelist: dict) -> List[int]: """ engaging_nodes = [] previous_edges = set() - for ts, e_list in graph_edgelist.items(): - node_set = set() - new_edges = {(u, v) for (u, v), _ in e_list.items() if frozenset({u, v}) not in previous_edges} - for u, v in new_edges: - if u not in node_set: - node_set.add(u) - if v not in node_set: - node_set.add(v) - engaging_nodes.append(len(node_set)) - previous_edges = {frozenset({u, v}) for (u, v), _ in e_list.items()} # Update the set of previous edges for the next timestamp - return engaging_nodes + for ts in range(len(graph.data)): + edgelist_t = graph.data[ts] + new_nodes = set() + + for edge in edgelist_t: + (u, v) = edge + if frozenset({u, v}) not in previous_edges: + if u not in new_nodes: + new_nodes.add(u) + if v not in new_nodes: + new_nodes.add(v) + + engaging_nodes.append(len(new_nodes)) + previous_edges = {frozenset({u, v}) for (u, v) in edgelist_t} # Update the set of previous edges for next timestamp + return engaging_nodes # def size_connected_components(graph) -> list: From c027b86657e419807d44224cdea17ebfced3d3a0 Mon Sep 17 00:00:00 2001 From: shenyangHuang Date: Sun, 21 Jan 2024 12:38:49 -0500 Subject: [PATCH 7/7] ready to merge --- docs/tutorials/Features.ipynb | 4 +- docs/tutorials/data_loader.ipynb | 2 +- examples/data_viz.py | 13 +- examples/newtest.py | 27 -- examples/test_package2.py | 20 -- tgx/utils/graph_utils.py | 406 +++++-------------------------- tgx/utils/newstat.py | 163 ------------- tgx/utils/plotting_utils.py | 32 +-- tgx/utils/stat.py | 112 +++++++-- tgx/viz/TEA.py | 222 ----------------- tgx/viz/TET.py | 338 ------------------------- 11 files changed, 174 insertions(+), 1165 deletions(-) delete mode 100644 examples/newtest.py delete mode 100644 examples/test_package2.py delete mode 100644 tgx/utils/newstat.py diff --git a/docs/tutorials/Features.ipynb b/docs/tutorials/Features.ipynb index 97bb18b..b0ec753 100644 --- a/docs/tutorials/Features.ipynb +++ b/docs/tutorials/Features.ipynb @@ -423,7 +423,7 @@ "outputs": [], "source": [ "node_engagement = get_avg_node_engagement(data)\n", - "plot_for_snapshots(node_engagement, filename=f\"{dataset.name}_avg_node_engagement_per_ts\", y_title=\"node engagement\")" + "plot_for_snapshots(node_engagement, y_title=\"node engagement\", filename=\"./\" + f\"{dataset.name}_avg_node_engagement_per_ts\")" ] }, { @@ -447,7 +447,7 @@ "outputs": [], "source": [ "node_activity = get_avg_node_activity(data)\n", - 
"plot_for_snapshots(node_activity, filename=f\"{dataset.name}_avg_node_activity_per_ts\", y_title=\"node activity\")" + "plot_for_snapshots(node_activity, y_title=\"node activity\", filename=\"./\" + f\"{dataset.name}_avg_node_activity_per_ts\")" ] } ], diff --git a/docs/tutorials/data_loader.ipynb b/docs/tutorials/data_loader.ipynb index f2bd485..1975875 100644 --- a/docs/tutorials/data_loader.ipynb +++ b/docs/tutorials/data_loader.ipynb @@ -256,7 +256,7 @@ } ], "source": [ - "from tgx.utils.graph_stat import get_avg_node_engagement\n", + "from tgx.utils.stat import get_avg_node_engagement\n", "\n", "node_engagement = get_avg_node_engagement(data)\n", "filename = f\"{dataset.name}_ave_node_engagement_per_ts\"\n", diff --git a/examples/data_viz.py b/examples/data_viz.py index 726e416..44e30aa 100644 --- a/examples/data_viz.py +++ b/examples/data_viz.py @@ -18,7 +18,7 @@ dtdg = ctdg.discretize(time_scale=time_scale)[0] -#! plotting the statistics, works +#* plotting the statistics tgx.degree_over_time(dtdg, network_name=dataset.name) tgx.nodes_over_time(dtdg, network_name=dataset.name) tgx.edges_over_time(dtdg, network_name=dataset.name) @@ -36,27 +36,26 @@ -#! compute statistics +#* compute statistics test_ratio = 0.15 tgx.get_reoccurrence(ctdg, test_ratio=test_ratio) tgx.get_surprise(ctdg, test_ratio=test_ratio) -tgx.get_avg_node_activity(dtdg) tgx.get_novelty(dtdg) # Number of Connected Components -tgx.connected_components_per_ts(dtdg, network_name=dataset.name, plot_path = plot_path) +tgx.connected_components_per_ts(dtdg, network_name=dataset.name) # Size of Largest Connected Component component_sizes = tgx.size_connected_components(dtdg) largest_component_sizes = [max(inner_list) if inner_list else 0 for inner_list in component_sizes] filename = f"{dataset.name}_largest_connected_component_size" -plot_for_snapshots(largest_component_sizes, filename, "Size of Largest Connected Component", plot_path = plot_path) +plot_for_snapshots(largest_component_sizes, y_title="Size of Largest Connected Component", filename="./"+filename) # Average Node Engagement engagements = tgx.get_avg_node_engagement(dtdg) filename = f"{dataset.name}_average_node_engagement" -plot_for_snapshots(engagements, filename, "Average Engagement", plot_path = plot_path) +plot_for_snapshots(engagements, y_title="Average Engagement", filename="./"+filename) # Degree Density -tgx.degree_density(dtdg, k=3, network_name=dataset.name, plot_path = plot_path) +tgx.degree_density(dtdg, k=3, network_name=dataset.name) diff --git a/examples/newtest.py b/examples/newtest.py deleted file mode 100644 index cc7aac5..0000000 --- a/examples/newtest.py +++ /dev/null @@ -1,27 +0,0 @@ -import tgx -import tgx.utils.newstat as newstat -from tgx.utils.plotting_utils import plot_for_snapshots - - -plot_path = "/home/mila/e/elahe.kooshafar/projects/TGX_results" - -dataset = tgx.builtin.uci() -G = tgx.Graph(dataset) -new_G = G.discretize(time_scale="weekly")[0] - -# Number of Connected Components -newstat.connected_components_per_ts(new_G, network_name=dataset.name, plot_path = plot_path) - -# Size of Largest Connected Component -component_sizes = newstat.size_connected_components(new_G) -largest_component_sizes = [max(inner_list) if inner_list else 0 for inner_list in component_sizes] -filename = f"{dataset.name}_largest_connected_component_size" -plot_for_snapshots(largest_component_sizes, filename, "Size of Largest Connected Component", plot_path = plot_path) - -# Average Node Engagement -engagements = newstat.get_avg_node_engagement(new_G) 
-filename = f"{dataset.name}_average_node_engagement" -plot_for_snapshots(engagements, filename, "Average Engagement", plot_path = plot_path) - -# Degree Density -newstat.degree_density(new_G, k=3, network_name=dataset.name, plot_path = plot_path) \ No newline at end of file diff --git a/examples/test_package2.py b/examples/test_package2.py deleted file mode 100644 index 423d8a5..0000000 --- a/examples/test_package2.py +++ /dev/null @@ -1,20 +0,0 @@ -import tgx - - -# data_path = '/network/scratch/r/razieh.shirzadkhani/' -# Plot_path = "" - -dataset = tgx.builtin.uci() -# dataset = tgx.tgb_data("tgbl-wiki") -G = tgx.Graph(dataset) -new_G = G.discretize(time_scale=dataset.time_scale) -# new_G.count_freq() -tgx.TEA(new_G, network_name=dataset.name) -tgx.TET(new_G, network_name=dataset.name) -# tgx.degree_over_time(new_G, filepath= Plot_path, network_name=dataset.name) -# tgx.nodes_over_time(new_G, filepath= Plot_path, network_name=dataset.name) -# tgx.nodes_and_edges_over_time(new_G, filepath= Plot_path, network_name=dataset.name) -# tgx.get_reoccurrence(new_G) -# tgx.get_surprise(new_G) -# tgx.get_novelty(new_G) -# tgx.get_avg_node_activity(new_G) \ No newline at end of file diff --git a/tgx/utils/graph_utils.py b/tgx/utils/graph_utils.py index 25ae851..4264c4b 100644 --- a/tgx/utils/graph_utils.py +++ b/tgx/utils/graph_utils.py @@ -1,286 +1,3 @@ -<<<<<<< HEAD -import numpy as np -from typing import Union, Optional - -__all__ = ["train_test_split", - "discretize_edges", - "subsampling", - "node_list", - "is_discretized", - "frequency_count"] - -SEC_IN_MIN = 60 -SEC_IN_HOUR = 3600 -SEC_IN_DAY = 86400 -SEC_IN_WEEK = 86400 * 7 -SEC_IN_MONTH = 86400 * 30 -SEC_IN_YEAR = 86400 * 365 - -# helper function to do ceiling divison, i.e. 5/2 = 3 -def ceiling_division(n, d): - q, r = divmod(n, d) - return q + bool(r) - - - -def discretize_edges(edgelist: dict, - time_scale: Union[int,str], - store_unix: Optional[bool] = False) -> list: - """ - util function for discretizing edgelist, expected timestamp on edges are unixtimestamp - this func supports discretization of edge timestamp - 1. by providing the number of intervals (int), it will equally divide the data into that number of intervals. Note that the last bin can have less duration than others. - 2. by providing a time granularity (str), it will divide the data into intervals based on the given granularity, i.e. 
"hourly", "daily", "weekly", "monthly", "yearly", the starting time of the dataset is consider the start of the first interval - Parameters: - edgelist: dict, dictionary of edges - time_scale: int or str, time interval to discretize the graph - store_unix: bool, whether to return the converted timestamps in unix format - Returns: - output list: the first item in the list is always the updated edgelist (dict, dictionary of edges with discretized timestamps) and the second item is the converted timestamps in unix format (list) if store_unix is True - """ - unique_ts = list(edgelist.keys()) - total_time = unique_ts[-1] - unique_ts[0] - if time_scale is not None: - if isinstance(time_scale, int): - interval_size = total_time // time_scale #integer timestamp of the bin, discounting any bin that has a smaller duration than others - elif isinstance(time_scale, str): - if time_scale == "minutely": - interval_size = SEC_IN_MIN - elif time_scale == "hourly": - interval_size = SEC_IN_HOUR - elif time_scale == "daily": - interval_size = SEC_IN_DAY - elif time_scale == "weekly": - interval_size = SEC_IN_WEEK - elif time_scale == "monthly": - interval_size = SEC_IN_MONTH - elif time_scale == "yearly": - interval_size = SEC_IN_YEAR - else: - raise TypeError("Invalid time interval") - else: - raise TypeError("Please provide a time interval") - - num_time_scale = ceiling_division(total_time, interval_size) - print(f'Discretizing data to {num_time_scale} timestamps...') - - updated_edgelist = {} - - if (store_unix): - unix_dict = [] - start_time = int(unique_ts[0]) - for ts, edges_list in edgelist.items(): - bin_ts = ceiling_division(ts, interval_size) #will correctly put edges into the last bin - - for edge in edges_list: - if bin_ts not in updated_edgelist: - updated_edgelist[bin_ts] = [edge] - else: - updated_edgelist[bin_ts].append(edge) - - if (store_unix): - unix_ts = start_time + int(ts // interval_size) * interval_size #round to the nearest start time - unix_ts = int(unix_ts) - unix_dict.extend([unix_ts] * len(edges_list)) - - output = [updated_edgelist] - if (store_unix): - output.append(unix_dict) - return output - - -# def edgelist_discritizer(edgelist: dict, -# time_scale: Union[str, int]): -# """ -# util function for discretizing edgelist, expected timestamp on edges are unixtimestamp -# this func supports discretization in two different ways -# 1. by providing the number of intervals (int), it will equally divide the data into that number of intervals. Note that the last bin can have less duration than others. -# 2. by providing a time granularity (str), it will divide the data into intervals based on the given granularity, i.e. "hourly", "daily", "weekly", "monthly", "yearly" -# In the second way however, the intervals will be based on utc timezone (dividing into days, hours this way) thus both first bin and last bin can have last duration than others. 
- -# Parameters: -# edgelist: dict, dictionary of edges -# time_scale: str or int, time interval to discretize the graph -# Returns: -# updated_edgelist: dict, dictionary of edges with discretized timestamps -# """ - -# unique_ts = list(edgelist.keys()) - -# total_time = unique_ts[-1] - unique_ts[0] -# if time_scale is not None: -# if isinstance(time_scale, str): -# if time_scale == "hourly": -# interval_size = SEC_IN_HOUR -# elif time_scale == "daily": -# interval_size = SEC_IN_DAY -# elif time_scale == "weekly": -# interval_size = SEC_IN_WEEK -# elif time_scale == "monthly": -# interval_size = SEC_IN_MONTH -# elif time_scale == "yearly": -# interval_size = SEC_IN_YEAR -# elif isinstance(time_scale, int): -# interval_size = int(total_time / (time_scale)) -# else: -# raise TypeError("Invalid time interval") -# else: -# raise TypeError("Please provide a time interval") -# num_time_scale = int(total_time/interval_size) -# print(f'Discretizing data to {num_time_scale} timestamps...') -# # if num_time_scale == 0: -# # print("Warning! Only one timestamp exist in the data.") - -# updated_edgelist = {} -# for ts, edges_list in edgelist.items(): -# bin_ts = int(ts / interval_size) -# if bin_ts >= num_time_scale: -# bin_ts -= 1 - -# for edge in edges_list: -# if bin_ts not in updated_edgelist: -# updated_edgelist[bin_ts] = [] -# updated_edgelist[bin_ts].append(edge) -# print("Discretization Done..!") -# return updated_edgelist - - - - - - - -def subsampling(graph: Union[object, dict], - node_list: Optional[list] = [], - random_selection: Optional[bool] = False, - N: Optional[int] = 100 - ) -> dict: - """ - Subsampling a part of graph by only monitoring the contacts from specific nodes' list - - Parameters: - graph: graph object or edgelist dict - node_list: list, a set of nodes to extract their contacts from the graph - random_selection: bool, wether randomly subsample a set of nodes from graph - N: int, number of nodes to be randomly sampled from graph - - Returns: - new_edgelist: dict, a dictionary of edges corresponding to nodes in the node_list - """ - print("Generate graph subsample...") - if isinstance(graph, dict): - edgelist = graph - nodes = node_list(graph) - else: - edgelist = graph.edgelist - nodes = graph.nodes() - - if random_selection: - node_list = list(np.random.choice(nodes, size = N, replace = False)) - - new_edgelist = {} - for t, edge_data in edgelist.items(): - for (u,v), f in edge_data.items(): - if u in node_list or v in node_list: - if t not in new_edgelist: - new_edgelist[t] = {} - new_edgelist[t][(u, v)] = f - else: - new_edgelist[t][(u, v)] = f - return new_edgelist - -def frequency_count(edgelist: dict): - new_edgelist = {} - - for t, edges_list in edgelist.items(): - for edge in edges_list: - (u, v) = edge - - # Check if this is the first edge occurning in this timestamp - if t not in new_edgelist: - new_edgelist[t] = {} - new_edgelist[t][(u, v)] = 1 - - else: - if (u, v) not in new_edgelist[t]: - new_edgelist[t][(u, v)] = 1 # If the edge was not occured in this timestamp before - else: - new_edgelist[t][(u, v)] += 1 - - return new_edgelist - -def node_list(dict_edgelist: dict) -> list: - - """ - create a list of nodes from edgelist dictionary - """ - node_list = {} - for _, edge_data in dict_edgelist.items(): - for (u,v), _ in edge_data.items(): - if u not in node_list: - node_list[u] = 1 - if v not in node_list: - node_list[v] = 1 - return list(node_list.keys()) - - -def train_test_split(data : dict, - val : bool = False, - ratio : list = [85, 15]) -> dict: - """ - 
Generate train/test split for the data - - Parameters: - data:dictionary of data - val: whether we want to have a validation split as well - ratio: list indication the ratio of the data in split. Sum of the list components should be 100. - - Returns: - two (train/test) or three (train/val/test) data dictionaries - """ - sum = 0 - for i in ratio: - sum += i - if sum != 100: - raise ValueError("invalid train/test split ratio. Sum of the ratios should be 100.") - - if val and len(ratio) != 3: - raise Exception("Provide train/val/test ratio") - elif not val and len(ratio) == 3: - print("Warning! Data is being splitted to train and test only!") - - data_len = len(data) - train_split = int(data_len * ratio[0] / 100) - train_data = {k: v for k, v in data.items() if k < train_split} - if val: - val_split = int(data_len * ratio[1] / 100) + train_split - val_data = {k: v for k, v in data.items() if train_split <= k < val_split} - test_data = {k: v for k, v in data.items() if val_split <= k <= data_len} - return train_data, val_data, test_data - - else: - test_data = {k: v for k, v in data.items() if train_split <= k <= data_len} - return train_data, test_data - - -def is_discretized(edgelist: Optional[dict], - max_timestamps: Optional[int] = 10000) -> bool: - r""" - Check if an edgelist is discretized or not. - """ - timestamps = list(edgelist.keys()) - discretized = True - if len(timestamps) > max_timestamps: - discretized = False - - return discretized - -def list2csv(lst: list, - fname: str, - delimiter: str = ",", - fmt: str = '%i'): - out_list = np.array(lst) -======= import numpy as np from typing import Union, Optional @@ -370,68 +87,6 @@ def discretize_edges(edgelist: dict, output.append(unix_dict) return output - -# def edgelist_discritizer(edgelist: dict, -# time_scale: Union[str, int]): -# """ -# util function for discretizing edgelist, expected timestamp on edges are unixtimestamp -# this func supports discretization in two different ways -# 1. by providing the number of intervals (int), it will equally divide the data into that number of intervals. Note that the last bin can have less duration than others. -# 2. by providing a time granularity (str), it will divide the data into intervals based on the given granularity, i.e. "hourly", "daily", "weekly", "monthly", "yearly" -# In the second way however, the intervals will be based on utc timezone (dividing into days, hours this way) thus both first bin and last bin can have last duration than others. - -# Parameters: -# edgelist: dict, dictionary of edges -# time_scale: str or int, time interval to discretize the graph -# Returns: -# updated_edgelist: dict, dictionary of edges with discretized timestamps -# """ - -# unique_ts = list(edgelist.keys()) - -# total_time = unique_ts[-1] - unique_ts[0] -# if time_scale is not None: -# if isinstance(time_scale, str): -# if time_scale == "hourly": -# interval_size = SEC_IN_HOUR -# elif time_scale == "daily": -# interval_size = SEC_IN_DAY -# elif time_scale == "weekly": -# interval_size = SEC_IN_WEEK -# elif time_scale == "monthly": -# interval_size = SEC_IN_MONTH -# elif time_scale == "yearly": -# interval_size = SEC_IN_YEAR -# elif isinstance(time_scale, int): -# interval_size = int(total_time / (time_scale)) -# else: -# raise TypeError("Invalid time interval") -# else: -# raise TypeError("Please provide a time interval") -# num_time_scale = int(total_time/interval_size) -# print(f'Discretizing data to {num_time_scale} timestamps...') -# # if num_time_scale == 0: -# # print("Warning! 
Only one timestamp exist in the data.") - -# updated_edgelist = {} -# for ts, edges_list in edgelist.items(): -# bin_ts = int(ts / interval_size) -# if bin_ts >= num_time_scale: -# bin_ts -= 1 - -# for edge in edges_list: -# if bin_ts not in updated_edgelist: -# updated_edgelist[bin_ts] = [] -# updated_edgelist[bin_ts].append(edge) -# print("Discretization Done..!") -# return updated_edgelist - - - - - - - def subsampling(graph: Union[object, dict], node_list: Optional[list] = [], random_selection: Optional[bool] = False, @@ -562,5 +217,62 @@ def list2csv(lst: list, delimiter: str = ",", fmt: str = '%i'): out_list = np.array(lst) ->>>>>>> master - np.savetxt(fname, out_list, delimiter=delimiter, fmt=fmt) \ No newline at end of file + np.savetxt(fname, out_list, delimiter=delimiter, fmt=fmt) + + + + +# def edgelist_discritizer(edgelist: dict, +# time_scale: Union[str, int]): +# """ +# util function for discretizing edgelist, expected timestamp on edges are unixtimestamp +# this func supports discretization in two different ways +# 1. by providing the number of intervals (int), it will equally divide the data into that number of intervals. Note that the last bin can have less duration than others. +# 2. by providing a time granularity (str), it will divide the data into intervals based on the given granularity, i.e. "hourly", "daily", "weekly", "monthly", "yearly" +# In the second way however, the intervals will be based on utc timezone (dividing into days, hours this way) thus both first bin and last bin can have last duration than others. + +# Parameters: +# edgelist: dict, dictionary of edges +# time_scale: str or int, time interval to discretize the graph +# Returns: +# updated_edgelist: dict, dictionary of edges with discretized timestamps +# """ + +# unique_ts = list(edgelist.keys()) + +# total_time = unique_ts[-1] - unique_ts[0] +# if time_scale is not None: +# if isinstance(time_scale, str): +# if time_scale == "hourly": +# interval_size = SEC_IN_HOUR +# elif time_scale == "daily": +# interval_size = SEC_IN_DAY +# elif time_scale == "weekly": +# interval_size = SEC_IN_WEEK +# elif time_scale == "monthly": +# interval_size = SEC_IN_MONTH +# elif time_scale == "yearly": +# interval_size = SEC_IN_YEAR +# elif isinstance(time_scale, int): +# interval_size = int(total_time / (time_scale)) +# else: +# raise TypeError("Invalid time interval") +# else: +# raise TypeError("Please provide a time interval") +# num_time_scale = int(total_time/interval_size) +# print(f'Discretizing data to {num_time_scale} timestamps...') +# # if num_time_scale == 0: +# # print("Warning! 
Only one timestamp exist in the data.") + +# updated_edgelist = {} +# for ts, edges_list in edgelist.items(): +# bin_ts = int(ts / interval_size) +# if bin_ts >= num_time_scale: +# bin_ts -= 1 + +# for edge in edges_list: +# if bin_ts not in updated_edgelist: +# updated_edgelist[bin_ts] = [] +# updated_edgelist[bin_ts].append(edge) +# print("Discretization Done..!") +# return updated_edgelist diff --git a/tgx/utils/newstat.py b/tgx/utils/newstat.py deleted file mode 100644 index 1df8cae..0000000 --- a/tgx/utils/newstat.py +++ /dev/null @@ -1,163 +0,0 @@ -from tgx.utils.plotting_utils import plot_for_snapshots, plot_nodes_edges_per_ts, plot_density_map -import networkx as nx -import numpy as np -from tgx.utils.graph_utils import train_test_split -from typing import List, Dict - -__all__ = ["connected_components_per_ts", - "size_connected_components", - "get_avg_node_engagement", - "degree_density"] - - -def degree_density(graph: tuple, k: int = 10, network_name: str = None, plot_path: str = None) -> None: - r""" - Plot density map of node degrees per time window - Parameters: - graph_edgelist: Dictionary containing graph data - k: number of time windows - network_name: name of the graph to be used in the output file name - plot_path: path to save the output figure - """ - graph_edgelist = graph.data - degrees_by_k_list = [] - temp = [] - temp_idx = 0 - unique_ts = list(graph_edgelist.keys()) - - for ts in unique_ts: - e_at_this_ts = graph_edgelist[ts] - G = nx.MultiGraph() - - for e in e_at_this_ts: - G.add_edge(e[0], e[1]) - - nodes = G.nodes() - degrees = [G.degree[n] for n in nodes] - - if temp_idx None: - r""" - Plot number of connected components per timestamp - Parameters: - graph: a list containing graph snapshots - network_name: name of the graph to be used in the output file name - plot_path: path to save the output figure - """ - num_components = [] - for t in range(len(graph.data)): - edgelist_t = graph.data[t] - nodes_t = graph.edgelist_node_list(edgelist_t) - parent = {node: node for node in nodes_t} - - for edge in edgelist_t: - (u, v) = edge - _merge(u, v, parent) - - num = 0 - for u in nodes_t: - if parent[u] == u: - num += 1 - num_components.append(num) - - if network_name is not None: - filename = f"{network_name}_connected_components_per_ts" - else: - filename = "_connected_components_per_ts" - - plot_for_snapshots(num_components, filename, "Number of connected components", plot_path = plot_path) - return - - -def size_connected_components(graph: tuple) -> List[List]: - r""" - Calculate the sizes of connected components per timestamp - Returns: - list[list]: A list containing lists of sizes of connected components for each timestamp. - """ - component_sizes = [] - for t in range(len(graph.data)): - edgelist_t = graph.data[t] - nodes_t = graph.edgelist_node_list(edgelist_t) - parent = {node: node for node in nodes_t} - - for edge in edgelist_t: - (u, v) = edge - _merge(u, v, parent) - - component_sizes_t = {} - for u in nodes_t: - root = _find(u, parent) - if root not in component_sizes_t: - component_sizes_t[root] = 0 - component_sizes_t[root] += 1 - - component_sizes_t_list = list(component_sizes_t.values()) - component_sizes.append(component_sizes_t_list) - - return component_sizes - - -def get_avg_node_engagement(graph: tuple) -> List[int]: - r""" - Calculate the average node engagement per timestamp, - the average number of distinct nodes that establish - at least one new connection. 
- Parameters: - graph_edgelist: Dictionary containing graph data - """ - engaging_nodes = [] - previous_edges = set() - - for ts in range(len(graph.data)): - edgelist_t = graph.data[ts] - new_nodes = set() - - for edge in edgelist_t: - (u, v) = edge - if frozenset({u, v}) not in previous_edges: - if u not in new_nodes: - new_nodes.add(u) - if v not in new_nodes: - new_nodes.add(v) - - engaging_nodes.append(len(new_nodes)) - previous_edges = {frozenset({u, v}) for (u, v) in edgelist_t} # Update the set of previous edges for next timestamp - - return engaging_nodes \ No newline at end of file diff --git a/tgx/utils/plotting_utils.py b/tgx/utils/plotting_utils.py index 3c3efad..f23df2c 100644 --- a/tgx/utils/plotting_utils.py +++ b/tgx/utils/plotting_utils.py @@ -28,8 +28,7 @@ def create_ts_list(start, end, metric=None, interval=None): def plot_nodes_edges_per_ts(edges: list, nodes: list, ts: list, - network_name: str, - plot_path: str = None, + filename: str = None, ylabel_1: str = 'Edges per Timestamp', ylabel_2: str = 'Nodes per Timestamp'): """ @@ -38,8 +37,7 @@ def plot_nodes_edges_per_ts(edges: list, edges: A list containing number of edges per timestamp nodes: A list containing number of nodes per timestamp ts: list of timestamps - network_name: Name of the network to be used in the output file name - plot_path: Path to save the output figure + filename: Name of the output file name, containing the path ylabel_1: Label for the edges per timestamp line ylabel_2: Label for the nodes per timestamp line """ @@ -59,10 +57,10 @@ def plot_nodes_edges_per_ts(edges: list, ax1.set_ylim(0) ax2.set_ylim(0) ax1.set_xlim(0, len(ts)-1) - if plot_path is not None: - filename = f"{network_name}_node&edge_per_ts" - plt.savefig(f'{plot_path}/{filename}') - plt.show() + if filename is not None: + plt.savefig(f'{filename}') + else: + plt.show() def plot_for_snapshots(data: list, y_title: str, @@ -72,10 +70,9 @@ def plot_for_snapshots(data: list, Plot a variable for different timestamps Parameters: data: A list of desired variable to be plotted - filename: Name of the output file name y_title: Title of the y axis + filename: Name of the output file name, containing the path show_ave: Whether to plot a line showing the average of the variable over all timestamps - plot_path: The path to save the output file ''' ts = list(range(0, len(data))) # plt.rcParams["font.family"] = "Times New Roman" @@ -96,9 +93,15 @@ def plot_for_snapshots(data: list, plt.show() -def plot_density_map(data, filename, y_title, plot_path=None): +def plot_density_map(data: list, + y_title: str, + filename: str = None,): ''' Plot a density map using fig and ax + Parameters: + data: A list of desired variable to be plotted + y_title: Title of the y axis + filename: Name of the output file name, containing the path ''' max_value = max(max(inner) for inner in data if inner) c = np.zeros((max_value, len(data))) @@ -125,9 +128,10 @@ def plot_density_map(data, filename, y_title, plot_path=None): # Adjust the aspect ratio of the plot ax.set_aspect('auto') - if plot_path is not None: - plt.savefig(f'{plot_path}/{filename}') - plt.show() + if filename is not None: + plt.savefig(f'{filename}') + else: + plt.show() if __name__ == "__main__": create_ts_list(86400, 86400*365, "unix", "month") diff --git a/tgx/utils/stat.py b/tgx/utils/stat.py index a2efa85..b1dc5e4 100644 --- a/tgx/utils/stat.py +++ b/tgx/utils/stat.py @@ -1,7 +1,7 @@ from tgx.utils.plotting_utils import plot_for_snapshots, plot_nodes_edges_per_ts, plot_density_map import networkx as 
diff --git a/tgx/utils/stat.py b/tgx/utils/stat.py
index a2efa85..b1dc5e4 100644
--- a/tgx/utils/stat.py
+++ b/tgx/utils/stat.py
@@ -1,7 +1,7 @@
 from tgx.utils.plotting_utils import plot_for_snapshots, plot_nodes_edges_per_ts, plot_density_map
 import networkx as nx
 import numpy as np
-from tgx.utils.graph_utils import train_test_split
+from typing import List
 
 __all__ = ["degree_over_time",
            "nodes_over_time",
@@ -15,6 +15,8 @@
            "get_surprise",
            "get_novelty",
            "get_avg_node_activity",
+           "connected_components_per_ts",
+           "size_connected_components",
            "get_avg_node_engagement",
            "degree_density"]
 
@@ -34,33 +36,30 @@ def _merge(x, y, parent):
         parent[root_x] = root_y
 
-
 def degree_over_time(graph: object,
                      network_name: str,
-                     filepath: str = ".") -> None:
+                     filepath: str = "./") -> None:
     r'''
     Plot average degree per timestamp.
     Parameters:
         graph: Graph object created by tgx.Graph containing edgelist
-        total_nodes: number of nodes that appear through all the snapshots
         network_name: name of the graph to be used in the output file name
         filepath: path to save the output figure
     '''
-    print("Plotting average degree per timestamp")
     ave_degree = _calculate_average_degree_per_ts(graph)
     if network_name is not None:
         filename = f"{network_name}_ave_degree_per_ts"
     else:
         filename = "ave_degree_per_ts"
-    plot_for_snapshots(ave_degree, filename, "Average degree", plot_path = filepath)
+    plot_for_snapshots(ave_degree, y_title="Average degree", filename=filepath+filename)
     return
 
 def nodes_over_time(graph: object,
                     network_name: str,
-                    filepath: str = ".") -> None:
+                    filepath: str = "./") -> None:
     r'''
     Plot number of active nodes per timestamp.
@@ -69,19 +68,17 @@ def nodes_over_time(graph: object,
         network_name: name of the graph to be used in the output file name
         filepath: path to save the output figure
     '''
-    print("Plotting number of nodes per timestamp.")
     active_nodes = _calculate_node_per_ts(graph)
     if network_name is not None:
         filename = f"{network_name}_nodes_per_ts"
     else:
         filename = "nodes_per_ts"
-    plot_for_snapshots(active_nodes, filename, "Number of nodes", plot_path = filepath)
+    plot_for_snapshots(active_nodes, y_title="Number of nodes", filename=filepath+filename)
     return
 
 def edges_over_time(graph: object,
-                    plot_path: str = None,
                     network_name: str = None,
-                    filepath: str = ".") -> None:
+                    filepath: str = "./") -> None:
     r'''
     Plot number of edges per timestamp.
     Parameters:
@@ -89,18 +86,17 @@ def edges_over_time(graph: object,
         network_name: name of the graph to be used in the output file name
         filepath: path to save the output figure
     '''
-    print("Plotting number of edges per timestamp.")
     active_edges = _calculate_edge_per_ts(graph)
     if network_name is not None:
         filename = f"{network_name}_edges_per_ts"
     else:
         filename = "_edges_per_ts"
-    plot_for_snapshots(active_edges, plot_path, filename, "Number of edges", plot_path = filepath)
+    plot_for_snapshots(active_edges, y_title="Number of edges", filename=filepath+filename)
     return
 
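The wrappers above build the output path by plain string concatenation (`filepath+filename`), which is why the default changed from "." to "./": the directory part must end with a separator. A small sketch of the caveat (the paths are illustrative; `os.path.join` is a possible alternative, not what the patch uses):

    import os

    filepath, filename = "./results/", "uci_edges_per_ts"
    assert filepath + filename == "./results/uci_edges_per_ts"      # trailing "/" works
    assert "./results" + filename != "./results/uci_edges_per_ts"   # missing separator breaks
    assert os.path.join("./results", filename) == "./results/uci_edges_per_ts"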
 def nodes_and_edges_over_time(graph: object,
                               network_name: str,
-                              filepath: str = "."):
+                              filepath: str = "./"):
     r"""
     Plot number of nodes per timestamp and number of edges per timestamp in one figure.
     Parameters:
@@ -112,9 +108,11 @@ def nodes_and_edges_over_time(graph: object,
     edges = _calculate_edge_per_ts(graph)
     nodes = _calculate_node_per_ts(graph)
     ts = list(range(0, len(graph.data)))
-
-
-    return plot_nodes_edges_per_ts(edges, nodes, ts, network_name = network_name, plot_path = filepath)
+    if network_name is not None:
+        filename = f"{network_name}_node_and_edges_per_ts"
+    else:
+        filename = "node_and_edges_per_ts"
+    return plot_nodes_edges_per_ts(edges, nodes, ts, filename=filepath+filename)
 
 
 
@@ -377,10 +375,17 @@ def get_avg_node_engagement(graph: object):
         previous_edges = {frozenset({u, v}) for (u, v) in e_list}  # Update the set of previous edges for the next timestamp
     return engaging_nodes
 
-
-def degree_density(graph: object, network_name: str = None, k = 10, plot_path: str = None) -> None:
+def degree_density(graph: tuple,
+                   k: int = 10,
+                   network_name: str = None,
+                   plot_path: str = "./") -> None:
     r"""
-    plot the density map of node degrees over timestamps
+    Plot density map of node degrees per time window
+    Parameters:
+        graph: Graph object created by tgx.Graph containing edgelist
+        k: number of time windows
+        network_name: name of the graph to be used in the output file name
+        plot_path: path to save the output figure
     """
     graph_edgelist = graph.data
     degrees_by_k_list = []
@@ -414,13 +419,13 @@ def degree_density(graph: object, network_name: str = None, k = 10, plot_path: s
     else:
         filename = "_degree_density"
 
-    plot_density_map(degrees_by_k_list, filename, "Node Degree", plot_path = plot_path)
+    plot_density_map(degrees_by_k_list, y_title="Node Degree", filename=plot_path+filename)
     return
 
 
 def connected_components_per_ts(graph: tuple,
                                 network_name: str = None,
-                                plot_path: str = None) -> None:
+                                plot_path: str = "./") -> None:
     r"""
     Plot number of connected components per timestamp
     Parameters:
@@ -449,5 +454,64 @@ def connected_components_per_ts(graph: tuple,
     else:
         filename = "_connected_components_per_ts"
 
-    plot_for_snapshots(num_components, filename, "Number of connected components", plot_path = plot_path)
-    return
\ No newline at end of file
+    plot_for_snapshots(num_components, y_title="Number of connected components", filename=plot_path+filename)
+    return
+
+
+# TODO turn this into a plotting function as well, can return the computed stats
+def size_connected_components(graph: tuple) -> List[List]:
+    r"""
+    Calculate the sizes of connected components per timestamp
+    Returns:
+        list[list]: A list containing lists of sizes of connected components for each timestamp.
+    """
+    component_sizes = []
+    for t in range(len(graph.data)):
+        edgelist_t = graph.data[t]
+        nodes_t = graph.edgelist_node_list(edgelist_t)
+        parent = {node: node for node in nodes_t}
+
+        for edge in edgelist_t:
+            (u, v) = edge
+            _merge(u, v, parent)
+
+        component_sizes_t = {}
+        for u in nodes_t:
+            root = _find(u, parent)
+            if root not in component_sizes_t:
+                component_sizes_t[root] = 0
+            component_sizes_t[root] += 1
+
+        component_sizes_t_list = list(component_sizes_t.values())
+        component_sizes.append(component_sizes_t_list)
+
+    return component_sizes
+
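Both component statistics rely on the `_find`/`_merge` union-find helpers defined near the top of stat.py. The same idea on a toy snapshot, as a self-contained sketch (toy edge list and standalone helpers, not the tgx API):

    # One snapshot: nodes 1-2-3 form a path, node 4 is isolated.
    edges_t = [(1, 2), (2, 3)]
    parent = {n: n for n in (1, 2, 3, 4)}

    def find(x):
        while parent[x] != x:
            parent[x] = parent[parent[x]]  # path compression
            x = parent[x]
        return x

    for u, v in edges_t:                   # union the endpoints of every edge
        parent[find(u)] = find(v)

    sizes = {}
    for n in parent:                       # nodes per root = component sizes
        sizes[find(n)] = sizes.get(find(n), 0) + 1
    print(sorted(sizes.values()))          # [1, 3]: the path 1-2-3 plus isolated node 4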
+# TODO turn this into a plotting function as well, can return the computed stats
+def get_avg_node_engagement(graph: tuple) -> List[int]:
+    r"""
+    Calculate the node engagement per timestamp: the number of distinct nodes
+    that establish at least one new connection at each timestamp.
+    Parameters:
+        graph: Graph object created by tgx.Graph containing edgelist
+    """
+    engaging_nodes = []
+    previous_edges = set()
+
+    for ts in range(len(graph.data)):
+        edgelist_t = graph.data[ts]
+        new_nodes = set()
+
+        for edge in edgelist_t:
+            (u, v) = edge
+            if frozenset({u, v}) not in previous_edges:
+                if u not in new_nodes:
+                    new_nodes.add(u)
+                if v not in new_nodes:
+                    new_nodes.add(v)
+
+        engaging_nodes.append(len(new_nodes))
+        previous_edges = {frozenset({u, v}) for (u, v) in edgelist_t}  # Update the set of previous edges for next timestamp
+
+    return engaging_nodes
\ No newline at end of file
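In `get_avg_node_engagement`, a node counts as engaged at a timestamp only if it touches an edge that did not appear in the immediately preceding snapshot; the `frozenset` keys make the lookup direction-independent. A toy trace of that logic (hand-made snapshots, standalone code rather than the tgx API):

    # t=0 introduces edge (1, 2); t=1 repeats it and adds (2, 3).
    snapshots = [[(1, 2)], [(1, 2), (2, 3)]]

    engaging_nodes, previous_edges = [], set()
    for edgelist_t in snapshots:
        new_nodes = {n for u, v in edgelist_t
                     if frozenset({u, v}) not in previous_edges
                     for n in (u, v)}
        engaging_nodes.append(len(new_nodes))
        previous_edges = {frozenset({u, v}) for u, v in edgelist_t}

    print(engaging_nodes)  # [2, 2]: nodes {1, 2} at t=0, nodes {2, 3} at t=1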
- """ - if isinstance(temp_edgelist, object): - if temp_edgelist.freq_data is None: - temp_edgelist.count_freq() - temp_edgelist = temp_edgelist.freq_data - - # check number of unique timestamps: - unique_ts = list(temp_edgelist.keys()) - # if len(unique_ts) > max_time_scale: - # inp = input(f"There are {unique_ts} timestamps in the data.\nDo you want to discretize the data to 1000 timestamps?(y/n)").lower() - # if inp == "y": - # temp_edgelist = edgelist_discritizer(temp_edgelist, - # unique_ts, - # time_scale = max_time_scale) - if time_scale is not None: - temp_edgelist = discretize_edges(temp_edgelist, - time_scale = time_scale) - - - ts_edges_dist, ts_edges_dist_density, edge_frequency_dict = TEA_process_edgelist_per_timestamp(temp_edgelist) - - TEA_plot_edges_bar(ts_edges_dist, - filepath = filepath, - fig_size = fig_size, - font_size = font_size, - network_name=network_name, - real_dates = real_dates, - test_split = test_split) - - if density: - return ts_edges_dist_density, edge_frequency_dict - - - -def TEA_process_edgelist_per_timestamp(temp_edgelist): - # generate distribution of the edges history - unique_ts = list(temp_edgelist.keys()) - # unique_ts.sort() - # print(f"There are {len(unique_ts)} timestamps.") - - # get node set & total number of nodes - node_dict = {} - for t, e_dict in temp_edgelist.items(): - for e, exist in e_dict.items(): - if e[0] not in node_dict: - node_dict[e[0]] = 1 - if e[1] not in node_dict: - node_dict[e[1]] = 1 - num_nodes = len(node_dict) - num_e_fully_connected = num_nodes * (num_nodes - 1) - - edge_frequency_dict = {} # how many times an edge is seen - ts_edges_dist = [] # contains different features specifying the characteristics of the edge distribution over time - ts_edges_dist_density = [] - for curr_t in unique_ts: - - # if curr_t < 2: - # print("curr_t", curr_t) - prev_ts = [ts for ts in unique_ts if ts < curr_t] - edges_in_prev_ts = {} - for bts in prev_ts: - edges_in_prev_ts.update(temp_edgelist[bts]) - - curr_ts_edge_list = temp_edgelist[curr_t] - for e in curr_ts_edge_list: - if e not in edge_frequency_dict: - edge_frequency_dict[e] = 1 - else: - edge_frequency_dict[e] += 1 - - if len(curr_ts_edge_list) > 0: - curr_ts_edges_dist = {'ts': curr_t, - 'new': len([e for e in curr_ts_edge_list if e not in edges_in_prev_ts]), - 'repeated': len([e for e in curr_ts_edge_list if e in edges_in_prev_ts]), - 'not_repeated': len([e for e in edges_in_prev_ts if e not in curr_ts_edge_list]), - 'total_curr_ts': len(curr_ts_edge_list), - 'total_seen_until_curr_ts': len(edges_in_prev_ts) + len(curr_ts_edge_list) - } - curr_ts_edges_dist_density = {'ts': curr_t, - 'new': (curr_ts_edges_dist['new'] * 1.0) / num_e_fully_connected, - 'repeated': (curr_ts_edges_dist['repeated'] * 1.0) / num_e_fully_connected, - 'not_repeated': (curr_ts_edges_dist[ - 'not_repeated'] * 1.0) / num_e_fully_connected, - 'total_curr_ts': (curr_ts_edges_dist[ - 'total_curr_ts'] * 1.0) / num_e_fully_connected, - 'total_seen_until_curr_ts': (curr_ts_edges_dist[ - 'total_seen_until_curr_ts'] * 1.0) / num_e_fully_connected, - } - else: - curr_ts_edges_dist = {'ts': curr_t, - 'new': 0, - 'repeated': 0, - 'not_repeated': 0, - 'total_curr_ts': 0, - 'total_seen_until_curr_ts': len(edges_in_prev_ts) + len(curr_ts_edge_list) - } - curr_ts_edges_dist_density = {'ts': curr_t, - 'new': 0, - 'repeated': 0, - 'not_repeated': 0, - 'total_curr_ts': 0, - 'total_seen_until_curr_ts': 0, - } - ts_edges_dist.append(curr_ts_edges_dist) - ts_edges_dist_density.append(curr_ts_edges_dist_density) - # 
-        # print(len(ts_edges_dist))
-        # print(edge_frequency_dict)
-        # break
-    return ts_edges_dist, ts_edges_dist_density, edge_frequency_dict
-
-
-def TEA_plot_edges_bar(ts_edges_dist: list,
-                       filepath: str = ".",
-                       fig_size: list = (9,5),
-                       font_size: int = 20,
-                       network_name: str = None,
-                       real_dates: list = None,
-                       time_scale: list = None,
-                       test_split: bool = False,
-                       show: bool =False):
-    r"""
-    Making TEA plot and save into pdf file.
-    Args:
-        ts_edges_dist: list of dictionaries containing the edge distribution over time.
-        filepath: Path to save the TEA Plot.
-        fig_size: Size of the figure to save.
-        font_size: Size of the text in the figure.
-        network_name: Name of the dataset to be used in the TEA plot file.
-        real_dates: list of real dates as ticks
-        time_scale: time_scale for discretizing data if already not done.
-        test_split: Whether show the test split on the plot.
-        show: Whether to show the plot.
-    """
-
-
-    ts_edges_dist_df = pd.DataFrame(ts_edges_dist, columns=['ts', 'new', 'repeated',
-                                                            'not_repeated',
-                                                            'total_curr_ts',
-                                                            'total_seen_until_curr_ts'])
-
-
-    ### Additional Stats ###
-    mean = ts_edges_dist_df.mean(axis=0)
-    # print("INFO: Network Name:", network_name)
-    # print("INFO: AVG. stats. over all timestamps: ", mean)
-    # print("INFO: ratio of avg.(new)/avg.(total_curr_ts): {:.2f}".format(mean['new'] / mean['total_curr_ts']))
-    ###
-
-    fig, ax = plt.subplots(figsize=fig_size)  # lastfm, mooc, reddit, UNtrade, UNvote
-    plt.subplots_adjust(bottom=0.2, left=0.2)
-    font_size = font_size
-    ticks_font_size = 15
-    plt.yticks(fontsize=ticks_font_size)
-    plt.xticks(fontsize=ticks_font_size)
-    if real_dates is not None:
-        start = real_dates[0]
-        end = real_dates[1]
-        metric = real_dates[2]
-        create_ts_list(start, end, metric=metric, interval=time_scale)
-    else:
-        duration = ts_edges_dist_df['ts'].tolist()
-        timestamps = [i for i in range(len(duration))]
-
-    new = ts_edges_dist_df['new'].tolist()
-    repeated = ts_edges_dist_df['repeated'].tolist()
-    # print(len(timestamps), repeated, new)
-    # plotting stuffs
-    # bar plot
-    plt.bar(timestamps, repeated, label='Repeated', color='#404040', alpha=0.4)
-    plt.bar(timestamps, new, label='New', bottom=repeated, color='#ca0020', alpha=0.8, hatch='//')
-    # test split line
-    if test_split:
-        plt.axvline(x=(timestamps[int(0.85 * len(timestamps))]), color="blue", linestyle="--", linewidth=2)
-        plt.text((timestamps[int(0.85 * len(timestamps))]), 0,
-                 'x', va='center', ha='center', fontsize=font_size, fontweight='heavy', color='blue')
-
-    plt.margins(x=0)
-    plt.xlabel("Timestamp", fontsize=font_size)
-    plt.ylabel("Number of edges", fontsize=font_size)
-    plt.legend(fontsize = 13)
-    if filepath is not None:
-        plt.savefig(f"{filepath}/{network_name}_TEA.pdf")
-        print("plot saved as " + f"{filepath}/{network_name}_TEA.pdf")
-    if (show):
-        plt.show()
-
-
-=======
 import pandas as pd
 import matplotlib.pyplot as plt
 from typing import Union, Optional
@@ -438,4 +217,3 @@ def TEA_plot_edges_bar(ts_edges_dist: list,
         plt.show()
 
->>>>>>> master
 
diff --git a/tgx/viz/TET.py b/tgx/viz/TET.py
index 15cbda7..52bb2a9 100644
--- a/tgx/viz/TET.py
+++ b/tgx/viz/TET.py
@@ -1,340 +1,3 @@
-<<<<<<< HEAD
-# TET Plot
-import numpy as np
-import pandas as pd
-import seaborn as sns
-from tqdm import tqdm
-from typing import Union, Optional
-import matplotlib.pyplot as plt
-from tgx.utils.graph_utils import discretize_edges
-
-
-# some parameters to be used for drawing
-E_ABSENT = 0
-E_PRESENCE_GENERAL = 1
-E_SEEN_IN_TRAIN = 2
-E_IN_TEST = 3
-E_NOT_IN_TEST = 4
-
-TEST_RATIO = 0.15
-
-# new color controlling parameters; Date: Dec. 22, 2021
-E_ONLY_TRAIN = 10
-E_TRAIN_AND_TEST = 20
-E_TRANSDUCTIVE = 30
-E_INDUCTIVE = 40
-
-
-#! should be merged graph class?
-def TET(temp_edgelist : Union[object, dict],
-        filepath: Optional[str] = ".",
-        time_scale : Union[str, int] = None,
-        network_name : str = None,
-        add_frame : bool = True,
-        test_split : bool = False,
-        figsize : tuple = (9, 5),
-        axis_title_font_size : int = 20,
-        ticks_font_size : int = 20,
-        show: bool = True):
-    r"""
-    Generate TET plots
-    Args:
-        temp_edgelist: a dictionary of temporal edges or a dataset object.
-        filepath: Path to save the TEA Plot.
-        figsize: Size of the figure to save.
-        axis_title_font_size: The font size of xis titles.
-        ticks_font_size: Size of the text in the figure.
-        add_frame: Add the frame to the plot.
-        network_name: Name of the dataset to be used in the TEA plot file.
-        time_scale: time_scale for discretizing data if already not done.
-        test_split: Whether show the test split on the plot.
-        max_time_scale: Maximum number of time_scale to discretize data.
-        show: Whether to show the plot.
-    """
-    if isinstance(temp_edgelist, object):
-        if temp_edgelist.freq_data is None:
-            temp_edgelist.count_freq()
-        temp_edgelist = temp_edgelist.freq_data
-
-    # check number of unique timestamps:
-    unique_ts = list(temp_edgelist.keys())
-    # if len(unique_ts) > max_time_scale:
-    #     inp = input(f"There are {unique_ts} timestamps in the data.\nDo you want to discretize the data to 1000 timestamps?(y/n)").lower()
-    #     if inp == "y":
-    #         temp_edgelist = edgelist_discritizer(temp_edgelist,
-    #                                              unique_ts,
-    #                                              time_scale = max_time_scale)
-    if time_scale is not None:
-        temp_edgelist = discretize_edges(temp_edgelist,
-                                         time_scale = time_scale)
-
-    edge_last_ts = generate_edge_last_timestamp(temp_edgelist)
-    edge_idx_map = generate_edge_idx_map(temp_edgelist, edge_last_ts)
-    idx_edge_map = {v: k for k, v in edge_idx_map.items()}  # key: edge index; value: actual edge (source, destination)
-    print("Info: Number of distinct edges (from index-edge map): {}".format(len(idx_edge_map)))
-
-    unique_ts_list = list(temp_edgelist.keys())
-    e_presence_mat = generate_edge_presence_matrix(unique_ts_list, idx_edge_map, edge_idx_map, temp_edgelist)
-    print("Info: edge-presence-matrix shape: {}".format(e_presence_mat.shape))
-    # print(np.unique(e_presence_mat, return_counts=True))
-    e_presence_mat, test_split_ts_value = process_presence_matrix(e_presence_mat, test_ratio_p=0.85)
-    print("Info: edge-presence-matrix shape: {}".format(e_presence_mat.shape))
-    # print(np.unique(e_presence_mat, return_counts=True))
-    fig_param = set_fig_param(network_name,
-                              fig_name = filepath,
-                              figsize = figsize,
-                              axis_title_font_size = axis_title_font_size,
-                              ticks_font_size = ticks_font_size)
-
-    plot_edge_presence_matrix(e_presence_mat, test_split_ts_value, unique_ts_list, list(idx_edge_map.keys()),
-                              fig_param, test_split = test_split, add_frames=add_frame, show=show)
-    return
-
-
-def generate_edge_last_timestamp(edges_per_ts):
-    """generates a dictionary containing the last timestamp of each edge"""
-    edge_last_ts = {}
-    for ts, e_list in edges_per_ts.items():
-        for e in e_list:
-            if e not in edge_last_ts:
-                edge_last_ts[e] = ts
-            else:
-                edge_last_ts[e] = max(ts, edge_last_ts[e])
-    return edge_last_ts
-
-
-def generate_edge_idx_map(edges_per_ts, edge_last_ts):
-    """
-    generates index for edges according to two-level sorting policy:
-    1. the first level is based on their first appearance timestamp
-    2. the second level is based on their last appearance timestamp
-    """
-    edge_idx_map = {}  # key: actual edge (source, destination), value: edge index
-    distinct_edge_idx = 0
-    for ts, ts_e_list in edges_per_ts.items():
-        e_last_ts_this_timestamp = {}
-        for e in ts_e_list:
-            e_last_ts_this_timestamp[e] = edge_last_ts[e]
-        e_last_ts_this_timestamp = dict(sorted(e_last_ts_this_timestamp.items(), key=lambda item: item[1]))
-        for e in e_last_ts_this_timestamp:
-            if e not in edge_idx_map:
-                edge_idx_map[e] = distinct_edge_idx
-                distinct_edge_idx += 1
-
-    return edge_idx_map
-
-
-def generate_edge_presence_matrix(unique_ts_list, idx_edge_map, edge_idx_map, edges_per_ts):
-    '''
-    Returns presence matrix with values 0 and 1 which indicate:
-    value = 0 : edge is not present in this timestamp
-    value = 1 : edge is present in this timestamp
-
-    shape: (ts, total number of edges)
-    '''
-    num_unique_ts = len(unique_ts_list)
-    num_unique_edge = len(idx_edge_map)
-    e_presence_mat = np.zeros([num_unique_ts, num_unique_edge], dtype=np.int8)
-    unique_ts_list = np.sort(unique_ts_list)
-
-    for x, ts in tqdm(enumerate(unique_ts_list)):
-        es_ts = edges_per_ts[ts]
-        for e in es_ts:
-            e_presence_mat[num_unique_ts - x - 1, edge_idx_map[e]] = E_PRESENCE_GENERAL
-
-    return e_presence_mat
-
-def process_presence_matrix(e_presence_matrix, test_ratio_p):
-    """
-    there are 4 types of edge presence:
-    1. only in train
-    2. in train and in test
-    3. in test and train (which is the number 2 but in later timestamps)
-    4. only in test
-    X: timestamp
-    Y: edge index
-    """
-    num_unique_ts = e_presence_matrix.shape[0]
-    num_unique_edges = e_presence_matrix.shape[1]
-    ts_idx_list = [i for i in range(num_unique_ts)]
-
-    # generating timestamp list for train and test:
-    test_split_ts_value = int(np.quantile(ts_idx_list, test_ratio_p))
-    train_ts_list = [ts for ts in ts_idx_list if ts <= test_split_ts_value]  # any timestamp in train/validation split
-    test_ts_list = [ts for ts in ts_idx_list if ts > test_split_ts_value]  # test_split_ts_value is in train
-
-    # first level processing: differentiate train set edges: 1) Only in train set, 2) in train & test set
-    print("First level processing: ")
-    print("Detecting edges present in train & test sets")
-    for tr_ts in tqdm(train_ts_list):
-        for eidx in range(num_unique_edges):
-            if e_presence_matrix[num_unique_ts - tr_ts - 1, eidx] == E_PRESENCE_GENERAL:
-                for test_ts_idx in range(test_split_ts_value + 1, num_unique_ts):
-                    if e_presence_matrix[num_unique_ts - test_ts_idx - 1, eidx] == E_PRESENCE_GENERAL:  # if seen in
-                        # the test set
-                        e_presence_matrix[num_unique_ts - tr_ts - 1, eidx] = E_TRAIN_AND_TEST
-                        break
-
-    # differentiate test set edges: 1) transductive (seen in train, repeating in test), 2) inductive (only in test)
-    print("Detecting transductive edges (seen in train, repeating in test)")
-    for ts in tqdm(test_ts_list):
-        for eidx in range(num_unique_edges):
-            if e_presence_matrix[num_unique_ts - ts - 1, eidx] == E_PRESENCE_GENERAL:
-                for prev_ts_idx in range(test_split_ts_value, -1, -1):
-                    if e_presence_matrix[num_unique_ts - prev_ts_idx - 1, eidx] == E_TRAIN_AND_TEST:  # if seen in
-                        # the training set
-                        e_presence_matrix[num_unique_ts - ts - 1, eidx] = E_TRANSDUCTIVE
-                        break
-
-    # second level processing
-    print("Second level processing:")
-    print("Detecting edges 1) Only in train set, 2) only in test (inductive)")
-    for ts in tqdm(range(num_unique_ts)):
-        for eidx in range(num_unique_edges):
-            if ts <= test_split_ts_value:
-                if e_presence_matrix[num_unique_ts - ts - 1, eidx] == E_PRESENCE_GENERAL:
-                    e_presence_matrix[num_unique_ts - ts - 1, eidx] = E_ONLY_TRAIN
-            else:
-                if e_presence_matrix[num_unique_ts - ts - 1, eidx] == E_PRESENCE_GENERAL:
-                    e_presence_matrix[num_unique_ts - ts - 1, eidx] = E_INDUCTIVE
-
-    return e_presence_matrix, test_split_ts_value
-
-
-def plot_edge_presence_matrix(e_presence_mat,
-                              test_split_ts_value,
-                              unique_ts_list,
-                              idx_edge_list,
-                              fig_param,
-                              test_split = False,
-                              add_frames=True,
-                              show=False):
-    print("Info: plotting edge presence heatmap for {} ...".format(fig_param.fig_name))
-
-    fig, ax = plt.subplots(figsize=fig_param.figsize)
-    plt.subplots_adjust(bottom=0.3, left=0.2)
-
-    # colors = ['white',  # E_ABSENCE
-    #           '#67a9cf',  # E_ONLY_TRAIN
-    #           '#ef8a62',  # E_TRAIN_AND_TEST
-    #           '#ef8a62',  # E_TRANSDUCTIVE
-    #           '#b2182b'  # E_INDUCTIVE
-    #           ]
-    if test_split:
-        colors = ['white',  # E_ABSENCE
-                  '#018571',  # E_ONLY_TRAIN 2c7bb6
-                  '#fc8d59',  # E_TRAIN_AND_TEST
-                  '#fc8d59',  # E_TRANSDUCTIVE
-                  '#b2182b'  # E_INDUCTIVE
-                  ]
-    else:
-        colors = ['white',
-                  '#ca0020',
-                  '#ca0020',
-                  '#ca0020',
-                  '#ca0020',]
-    # print(sns.color_palette(colors, as_cmap=True))
-    frame_color = "grey"  # "#bababa"
-    time_split_color = "black"
-    axis_title_font_size = fig_param.axis_title_font_size
-    x_font_size = fig_param.ticks_font_size
-    y_font_size = fig_param.ticks_font_size
-
-    ax = sns.heatmap(e_presence_mat, cmap=sns.color_palette(colors, as_cmap=True), cbar=False)
-
-    # processing x-axis
-    x_gaps = np.linspace(0, len((idx_edge_list)), num=5)
-    x_labels = x_gaps / len(idx_edge_list)
-    x_labels = [int(100*x) for x in x_labels]
-    plt.xticks(x_gaps, x_labels, rotation=0, fontsize=x_font_size)
-
-    # processing y-axis
-    t_gaps = np.linspace(0, len(unique_ts_list), num=5)
-    t_labels = [int(len(unique_ts_list) - tidx) for tidx in t_gaps]
-    plt.yticks(t_gaps, t_labels, rotation=90, fontsize=y_font_size)
-
-    # axis & title
-    # plt.margins(x=0)
-    plt.xlabel("Percentage of observed edges", fontsize=axis_title_font_size)
-    plt.ylabel("Timestamp", fontsize=axis_title_font_size)
-
-    # requirements for additional features
-    x_length = e_presence_mat.shape[1] - 1
-    y_length = e_presence_mat.shape[0] - 1
-    test_split_idx_value = y_length - test_split_ts_value
-    e_border_idx = 0
-    for e_idx in range(e_presence_mat.shape[1] - 1, -1, -1):
-        if e_presence_mat[y_length - test_split_ts_value, e_idx] != E_ABSENT:
-            e_border_idx = e_idx
-            break
-
-    # rectangle for different parts of the dataset
-    if add_frames and test_split:
-        print("Info: Border edge index:", e_border_idx)
-        print("Info: Test split timestamp value:", test_split_ts_value)
-        rect_train = plt.Rectangle((0, y_length - test_split_ts_value + 0.085), e_border_idx, test_split_ts_value + 0.9,
-                                   fill=False, linewidth=2, edgecolor=frame_color)
-        rect_test_mayseen = plt.Rectangle((0, 0), e_border_idx, y_length - test_split_ts_value - 0.1,
-                                          fill=False, linewidth=2, edgecolor=frame_color)
-        rect_test_new = plt.Rectangle((e_border_idx, 0), x_length - e_border_idx,
-                                      y_length - test_split_ts_value - 0.1,
-                                      fill=False, linewidth=2, edgecolor=frame_color)
-        ax = ax or plt.gca()
-        ax.add_patch(rect_train)
-        ax.add_patch(rect_test_mayseen)
-        ax.add_patch(rect_test_new)
-
-    elif add_frames:
-        ax.add_patch(plt.Rectangle((0, 0), x_length, y_length+1,
-                                   fill=False, linewidth=2, edgecolor=frame_color))
-    # test split horizontal line
-    if test_split:
-        plt.axhline(y=test_split_idx_value, color=time_split_color, linestyle="--", linewidth=2, label='x')
-        plt.text(x=0, y=test_split_idx_value, s='x', color=time_split_color, va='center', ha='center',
-                 fontsize=y_font_size, fontweight='heavy')
-
-    if fig_param.fig_name is not None:
-        # print("Info: file name: {}".format(fig_param.fig_name))
-        plt.savefig(f"{fig_param.fig_name}/{fig_param.network_name}_TET.pdf")
-        plt.show()
-    print("Info: plotting done!")
-
-def set_fig_param(network_name, fig_name = None,
-                  figsize = (9, 5),
-                  axis_title_font_size = 20,
-                  ticks_font_size = 22,
-                  axis_tick_gap = 20,
-                  timestamp_split_cross_mark_offset = 1):
-
-    # if network_name in ['US Legislative', 'Canadian Vote', 'UN Trade', 'UN Vote']:
-    #     axis_tick_gap = axis_tick_gap * 0.35
-
-    # elif network_name in ['Reddit', 'Wikipedia', 'UCI', 'Social Evo.', 'Flights', 'LastFM', 'MOOC']:
-    #     axis_tick_gap = axis_tick_gap * 0.5
-
-    # elif network_name in ['Enron']:
-    #     axis_tick_gap = axis_tick_gap * 0.4
-
-    fig_param = Fig_Param(network_name,
-                          fig_name,
-                          figsize,
-                          axis_title_font_size,
-                          ticks_font_size,
-                          axis_tick_gap,
-                          timestamp_split_cross_mark_offset)
-
-    return fig_param
-
-class Fig_Param:
-    def __init__(self, network_name, fig_name, figsize, axis_title_font_size, ticks_font_size, axis_tick_gap,
-                 timestamp_split_cross_mark_offset):
-        self.network_name = network_name
-        self.fig_name = fig_name
-        self.figsize = figsize
-        self.axis_title_font_size = axis_title_font_size
-        self.ticks_font_size = ticks_font_size
-        self.axis_tick_gap = axis_tick_gap
-=======
 # TET Plot
 import numpy as np
 import pandas as pd
@@ -670,5 +333,4 @@ def __init__(self, network_name, fig_name, figsize, axis_title_font_size, ticks_
         self.axis_title_font_size = axis_title_font_size
         self.ticks_font_size = ticks_font_size
         self.axis_tick_gap = axis_tick_gap
->>>>>>> master
         self.timestamp_split_cross_mark_offset = timestamp_split_cross_mark_offset
\ No newline at end of file