diff --git a/docs/tutorials/Features.ipynb b/docs/tutorials/Features.ipynb index 97bb18b..b0ec753 100644 --- a/docs/tutorials/Features.ipynb +++ b/docs/tutorials/Features.ipynb @@ -423,7 +423,7 @@ "outputs": [], "source": [ "node_engagement = get_avg_node_engagement(data)\n", - "plot_for_snapshots(node_engagement, filename=f\"{dataset.name}_avg_node_engagement_per_ts\", y_title=\"node engagement\")" + "plot_for_snapshots(node_engagement, y_title=\"node engagement\", filename=\"./\" + f\"{dataset.name}_avg_node_engagement_per_ts\")" ] }, { @@ -447,7 +447,7 @@ "outputs": [], "source": [ "node_activity = get_avg_node_activity(data)\n", - "plot_for_snapshots(node_activity, filename=f\"{dataset.name}_avg_node_activity_per_ts\", y_title=\"node activity\")" + "plot_for_snapshots(node_activity, y_title=\"node activity\", filename=\"./\" + f\"{dataset.name}_avg_node_activity_per_ts\")" ] } ], diff --git a/docs/tutorials/data_loader.ipynb b/docs/tutorials/data_loader.ipynb index f2bd485..1975875 100644 --- a/docs/tutorials/data_loader.ipynb +++ b/docs/tutorials/data_loader.ipynb @@ -256,7 +256,7 @@ } ], "source": [ - "from tgx.utils.graph_stat import get_avg_node_engagement\n", + "from tgx.utils.stat import get_avg_node_engagement\n", "\n", "node_engagement = get_avg_node_engagement(data)\n", "filename = f\"{dataset.name}_ave_node_engagement_per_ts\"\n", diff --git a/examples/data_viz.py b/examples/data_viz.py index 726e416..44e30aa 100644 --- a/examples/data_viz.py +++ b/examples/data_viz.py @@ -18,7 +18,7 @@ dtdg = ctdg.discretize(time_scale=time_scale)[0] -#! plotting the statistics, works +#* plotting the statistics tgx.degree_over_time(dtdg, network_name=dataset.name) tgx.nodes_over_time(dtdg, network_name=dataset.name) tgx.edges_over_time(dtdg, network_name=dataset.name) @@ -36,27 +36,26 @@ -#! compute statistics +#* compute statistics test_ratio = 0.15 tgx.get_reoccurrence(ctdg, test_ratio=test_ratio) tgx.get_surprise(ctdg, test_ratio=test_ratio) -tgx.get_avg_node_activity(dtdg) tgx.get_novelty(dtdg) # Number of Connected Components -tgx.connected_components_per_ts(dtdg, network_name=dataset.name, plot_path = plot_path) +tgx.connected_components_per_ts(dtdg, network_name=dataset.name) # Size of Largest Connected Component component_sizes = tgx.size_connected_components(dtdg) largest_component_sizes = [max(inner_list) if inner_list else 0 for inner_list in component_sizes] filename = f"{dataset.name}_largest_connected_component_size" -plot_for_snapshots(largest_component_sizes, filename, "Size of Largest Connected Component", plot_path = plot_path) +plot_for_snapshots(largest_component_sizes, y_title="Size of Largest Connected Component", filename="./"+filename) # Average Node Engagement engagements = tgx.get_avg_node_engagement(dtdg) filename = f"{dataset.name}_average_node_engagement" -plot_for_snapshots(engagements, filename, "Average Engagement", plot_path = plot_path) +plot_for_snapshots(engagements, y_title="Average Engagement", filename="./"+filename) # Degree Density -tgx.degree_density(dtdg, k=3, network_name=dataset.name, plot_path = plot_path) +tgx.degree_density(dtdg, k=3, network_name=dataset.name) diff --git a/examples/newtest.py b/examples/newtest.py deleted file mode 100644 index cc7aac5..0000000 --- a/examples/newtest.py +++ /dev/null @@ -1,27 +0,0 @@ -import tgx -import tgx.utils.newstat as newstat -from tgx.utils.plotting_utils import plot_for_snapshots - - -plot_path = "/home/mila/e/elahe.kooshafar/projects/TGX_results" - -dataset = tgx.builtin.uci() -G = tgx.Graph(dataset) -new_G = G.discretize(time_scale="weekly")[0] - -# Number of Connected Components -newstat.connected_components_per_ts(new_G, network_name=dataset.name, plot_path = plot_path) - -# Size of Largest Connected Component -component_sizes = newstat.size_connected_components(new_G) -largest_component_sizes = [max(inner_list) if inner_list else 0 for inner_list in component_sizes] -filename = f"{dataset.name}_largest_connected_component_size" -plot_for_snapshots(largest_component_sizes, filename, "Size of Largest Connected Component", plot_path = plot_path) - -# Average Node Engagement -engagements = newstat.get_avg_node_engagement(new_G) -filename = f"{dataset.name}_average_node_engagement" -plot_for_snapshots(engagements, filename, "Average Engagement", plot_path = plot_path) - -# Degree Density -newstat.degree_density(new_G, k=3, network_name=dataset.name, plot_path = plot_path) \ No newline at end of file diff --git a/examples/test_package2.py b/examples/test_package2.py deleted file mode 100644 index 423d8a5..0000000 --- a/examples/test_package2.py +++ /dev/null @@ -1,20 +0,0 @@ -import tgx - - -# data_path = '/network/scratch/r/razieh.shirzadkhani/' -# Plot_path = "" - -dataset = tgx.builtin.uci() -# dataset = tgx.tgb_data("tgbl-wiki") -G = tgx.Graph(dataset) -new_G = G.discretize(time_scale=dataset.time_scale) -# new_G.count_freq() -tgx.TEA(new_G, network_name=dataset.name) -tgx.TET(new_G, network_name=dataset.name) -# tgx.degree_over_time(new_G, filepath= Plot_path, network_name=dataset.name) -# tgx.nodes_over_time(new_G, filepath= Plot_path, network_name=dataset.name) -# tgx.nodes_and_edges_over_time(new_G, filepath= Plot_path, network_name=dataset.name) -# tgx.get_reoccurrence(new_G) -# tgx.get_surprise(new_G) -# tgx.get_novelty(new_G) -# tgx.get_avg_node_activity(new_G) \ No newline at end of file diff --git a/tgx/utils/graph_utils.py b/tgx/utils/graph_utils.py index 25ae851..4264c4b 100644 --- a/tgx/utils/graph_utils.py +++ b/tgx/utils/graph_utils.py @@ -1,286 +1,3 @@ -<<<<<<< HEAD -import numpy as np -from typing import Union, Optional - -__all__ = ["train_test_split", - "discretize_edges", - "subsampling", - "node_list", - "is_discretized", - "frequency_count"] - -SEC_IN_MIN = 60 -SEC_IN_HOUR = 3600 -SEC_IN_DAY = 86400 -SEC_IN_WEEK = 86400 * 7 -SEC_IN_MONTH = 86400 * 30 -SEC_IN_YEAR = 86400 * 365 - -# helper function to do ceiling divison, i.e. 5/2 = 3 -def ceiling_division(n, d): - q, r = divmod(n, d) - return q + bool(r) - - - -def discretize_edges(edgelist: dict, - time_scale: Union[int,str], - store_unix: Optional[bool] = False) -> list: - """ - util function for discretizing edgelist, expected timestamp on edges are unixtimestamp - this func supports discretization of edge timestamp - 1. by providing the number of intervals (int), it will equally divide the data into that number of intervals. Note that the last bin can have less duration than others. - 2. by providing a time granularity (str), it will divide the data into intervals based on the given granularity, i.e. "hourly", "daily", "weekly", "monthly", "yearly", the starting time of the dataset is consider the start of the first interval - Parameters: - edgelist: dict, dictionary of edges - time_scale: int or str, time interval to discretize the graph - store_unix: bool, whether to return the converted timestamps in unix format - Returns: - output list: the first item in the list is always the updated edgelist (dict, dictionary of edges with discretized timestamps) and the second item is the converted timestamps in unix format (list) if store_unix is True - """ - unique_ts = list(edgelist.keys()) - total_time = unique_ts[-1] - unique_ts[0] - if time_scale is not None: - if isinstance(time_scale, int): - interval_size = total_time // time_scale #integer timestamp of the bin, discounting any bin that has a smaller duration than others - elif isinstance(time_scale, str): - if time_scale == "minutely": - interval_size = SEC_IN_MIN - elif time_scale == "hourly": - interval_size = SEC_IN_HOUR - elif time_scale == "daily": - interval_size = SEC_IN_DAY - elif time_scale == "weekly": - interval_size = SEC_IN_WEEK - elif time_scale == "monthly": - interval_size = SEC_IN_MONTH - elif time_scale == "yearly": - interval_size = SEC_IN_YEAR - else: - raise TypeError("Invalid time interval") - else: - raise TypeError("Please provide a time interval") - - num_time_scale = ceiling_division(total_time, interval_size) - print(f'Discretizing data to {num_time_scale} timestamps...') - - updated_edgelist = {} - - if (store_unix): - unix_dict = [] - start_time = int(unique_ts[0]) - for ts, edges_list in edgelist.items(): - bin_ts = ceiling_division(ts, interval_size) #will correctly put edges into the last bin - - for edge in edges_list: - if bin_ts not in updated_edgelist: - updated_edgelist[bin_ts] = [edge] - else: - updated_edgelist[bin_ts].append(edge) - - if (store_unix): - unix_ts = start_time + int(ts // interval_size) * interval_size #round to the nearest start time - unix_ts = int(unix_ts) - unix_dict.extend([unix_ts] * len(edges_list)) - - output = [updated_edgelist] - if (store_unix): - output.append(unix_dict) - return output - - -# def edgelist_discritizer(edgelist: dict, -# time_scale: Union[str, int]): -# """ -# util function for discretizing edgelist, expected timestamp on edges are unixtimestamp -# this func supports discretization in two different ways -# 1. by providing the number of intervals (int), it will equally divide the data into that number of intervals. Note that the last bin can have less duration than others. -# 2. by providing a time granularity (str), it will divide the data into intervals based on the given granularity, i.e. "hourly", "daily", "weekly", "monthly", "yearly" -# In the second way however, the intervals will be based on utc timezone (dividing into days, hours this way) thus both first bin and last bin can have last duration than others. - -# Parameters: -# edgelist: dict, dictionary of edges -# time_scale: str or int, time interval to discretize the graph -# Returns: -# updated_edgelist: dict, dictionary of edges with discretized timestamps -# """ - -# unique_ts = list(edgelist.keys()) - -# total_time = unique_ts[-1] - unique_ts[0] -# if time_scale is not None: -# if isinstance(time_scale, str): -# if time_scale == "hourly": -# interval_size = SEC_IN_HOUR -# elif time_scale == "daily": -# interval_size = SEC_IN_DAY -# elif time_scale == "weekly": -# interval_size = SEC_IN_WEEK -# elif time_scale == "monthly": -# interval_size = SEC_IN_MONTH -# elif time_scale == "yearly": -# interval_size = SEC_IN_YEAR -# elif isinstance(time_scale, int): -# interval_size = int(total_time / (time_scale)) -# else: -# raise TypeError("Invalid time interval") -# else: -# raise TypeError("Please provide a time interval") -# num_time_scale = int(total_time/interval_size) -# print(f'Discretizing data to {num_time_scale} timestamps...') -# # if num_time_scale == 0: -# # print("Warning! Only one timestamp exist in the data.") - -# updated_edgelist = {} -# for ts, edges_list in edgelist.items(): -# bin_ts = int(ts / interval_size) -# if bin_ts >= num_time_scale: -# bin_ts -= 1 - -# for edge in edges_list: -# if bin_ts not in updated_edgelist: -# updated_edgelist[bin_ts] = [] -# updated_edgelist[bin_ts].append(edge) -# print("Discretization Done..!") -# return updated_edgelist - - - - - - - -def subsampling(graph: Union[object, dict], - node_list: Optional[list] = [], - random_selection: Optional[bool] = False, - N: Optional[int] = 100 - ) -> dict: - """ - Subsampling a part of graph by only monitoring the contacts from specific nodes' list - - Parameters: - graph: graph object or edgelist dict - node_list: list, a set of nodes to extract their contacts from the graph - random_selection: bool, wether randomly subsample a set of nodes from graph - N: int, number of nodes to be randomly sampled from graph - - Returns: - new_edgelist: dict, a dictionary of edges corresponding to nodes in the node_list - """ - print("Generate graph subsample...") - if isinstance(graph, dict): - edgelist = graph - nodes = node_list(graph) - else: - edgelist = graph.edgelist - nodes = graph.nodes() - - if random_selection: - node_list = list(np.random.choice(nodes, size = N, replace = False)) - - new_edgelist = {} - for t, edge_data in edgelist.items(): - for (u,v), f in edge_data.items(): - if u in node_list or v in node_list: - if t not in new_edgelist: - new_edgelist[t] = {} - new_edgelist[t][(u, v)] = f - else: - new_edgelist[t][(u, v)] = f - return new_edgelist - -def frequency_count(edgelist: dict): - new_edgelist = {} - - for t, edges_list in edgelist.items(): - for edge in edges_list: - (u, v) = edge - - # Check if this is the first edge occurning in this timestamp - if t not in new_edgelist: - new_edgelist[t] = {} - new_edgelist[t][(u, v)] = 1 - - else: - if (u, v) not in new_edgelist[t]: - new_edgelist[t][(u, v)] = 1 # If the edge was not occured in this timestamp before - else: - new_edgelist[t][(u, v)] += 1 - - return new_edgelist - -def node_list(dict_edgelist: dict) -> list: - - """ - create a list of nodes from edgelist dictionary - """ - node_list = {} - for _, edge_data in dict_edgelist.items(): - for (u,v), _ in edge_data.items(): - if u not in node_list: - node_list[u] = 1 - if v not in node_list: - node_list[v] = 1 - return list(node_list.keys()) - - -def train_test_split(data : dict, - val : bool = False, - ratio : list = [85, 15]) -> dict: - """ - Generate train/test split for the data - - Parameters: - data:dictionary of data - val: whether we want to have a validation split as well - ratio: list indication the ratio of the data in split. Sum of the list components should be 100. - - Returns: - two (train/test) or three (train/val/test) data dictionaries - """ - sum = 0 - for i in ratio: - sum += i - if sum != 100: - raise ValueError("invalid train/test split ratio. Sum of the ratios should be 100.") - - if val and len(ratio) != 3: - raise Exception("Provide train/val/test ratio") - elif not val and len(ratio) == 3: - print("Warning! Data is being splitted to train and test only!") - - data_len = len(data) - train_split = int(data_len * ratio[0] / 100) - train_data = {k: v for k, v in data.items() if k < train_split} - if val: - val_split = int(data_len * ratio[1] / 100) + train_split - val_data = {k: v for k, v in data.items() if train_split <= k < val_split} - test_data = {k: v for k, v in data.items() if val_split <= k <= data_len} - return train_data, val_data, test_data - - else: - test_data = {k: v for k, v in data.items() if train_split <= k <= data_len} - return train_data, test_data - - -def is_discretized(edgelist: Optional[dict], - max_timestamps: Optional[int] = 10000) -> bool: - r""" - Check if an edgelist is discretized or not. - """ - timestamps = list(edgelist.keys()) - discretized = True - if len(timestamps) > max_timestamps: - discretized = False - - return discretized - -def list2csv(lst: list, - fname: str, - delimiter: str = ",", - fmt: str = '%i'): - out_list = np.array(lst) -======= import numpy as np from typing import Union, Optional @@ -370,68 +87,6 @@ def discretize_edges(edgelist: dict, output.append(unix_dict) return output - -# def edgelist_discritizer(edgelist: dict, -# time_scale: Union[str, int]): -# """ -# util function for discretizing edgelist, expected timestamp on edges are unixtimestamp -# this func supports discretization in two different ways -# 1. by providing the number of intervals (int), it will equally divide the data into that number of intervals. Note that the last bin can have less duration than others. -# 2. by providing a time granularity (str), it will divide the data into intervals based on the given granularity, i.e. "hourly", "daily", "weekly", "monthly", "yearly" -# In the second way however, the intervals will be based on utc timezone (dividing into days, hours this way) thus both first bin and last bin can have last duration than others. - -# Parameters: -# edgelist: dict, dictionary of edges -# time_scale: str or int, time interval to discretize the graph -# Returns: -# updated_edgelist: dict, dictionary of edges with discretized timestamps -# """ - -# unique_ts = list(edgelist.keys()) - -# total_time = unique_ts[-1] - unique_ts[0] -# if time_scale is not None: -# if isinstance(time_scale, str): -# if time_scale == "hourly": -# interval_size = SEC_IN_HOUR -# elif time_scale == "daily": -# interval_size = SEC_IN_DAY -# elif time_scale == "weekly": -# interval_size = SEC_IN_WEEK -# elif time_scale == "monthly": -# interval_size = SEC_IN_MONTH -# elif time_scale == "yearly": -# interval_size = SEC_IN_YEAR -# elif isinstance(time_scale, int): -# interval_size = int(total_time / (time_scale)) -# else: -# raise TypeError("Invalid time interval") -# else: -# raise TypeError("Please provide a time interval") -# num_time_scale = int(total_time/interval_size) -# print(f'Discretizing data to {num_time_scale} timestamps...') -# # if num_time_scale == 0: -# # print("Warning! Only one timestamp exist in the data.") - -# updated_edgelist = {} -# for ts, edges_list in edgelist.items(): -# bin_ts = int(ts / interval_size) -# if bin_ts >= num_time_scale: -# bin_ts -= 1 - -# for edge in edges_list: -# if bin_ts not in updated_edgelist: -# updated_edgelist[bin_ts] = [] -# updated_edgelist[bin_ts].append(edge) -# print("Discretization Done..!") -# return updated_edgelist - - - - - - - def subsampling(graph: Union[object, dict], node_list: Optional[list] = [], random_selection: Optional[bool] = False, @@ -562,5 +217,62 @@ def list2csv(lst: list, delimiter: str = ",", fmt: str = '%i'): out_list = np.array(lst) ->>>>>>> master - np.savetxt(fname, out_list, delimiter=delimiter, fmt=fmt) \ No newline at end of file + np.savetxt(fname, out_list, delimiter=delimiter, fmt=fmt) + + + + +# def edgelist_discritizer(edgelist: dict, +# time_scale: Union[str, int]): +# """ +# util function for discretizing edgelist, expected timestamp on edges are unixtimestamp +# this func supports discretization in two different ways +# 1. by providing the number of intervals (int), it will equally divide the data into that number of intervals. Note that the last bin can have less duration than others. +# 2. by providing a time granularity (str), it will divide the data into intervals based on the given granularity, i.e. "hourly", "daily", "weekly", "monthly", "yearly" +# In the second way however, the intervals will be based on utc timezone (dividing into days, hours this way) thus both first bin and last bin can have last duration than others. + +# Parameters: +# edgelist: dict, dictionary of edges +# time_scale: str or int, time interval to discretize the graph +# Returns: +# updated_edgelist: dict, dictionary of edges with discretized timestamps +# """ + +# unique_ts = list(edgelist.keys()) + +# total_time = unique_ts[-1] - unique_ts[0] +# if time_scale is not None: +# if isinstance(time_scale, str): +# if time_scale == "hourly": +# interval_size = SEC_IN_HOUR +# elif time_scale == "daily": +# interval_size = SEC_IN_DAY +# elif time_scale == "weekly": +# interval_size = SEC_IN_WEEK +# elif time_scale == "monthly": +# interval_size = SEC_IN_MONTH +# elif time_scale == "yearly": +# interval_size = SEC_IN_YEAR +# elif isinstance(time_scale, int): +# interval_size = int(total_time / (time_scale)) +# else: +# raise TypeError("Invalid time interval") +# else: +# raise TypeError("Please provide a time interval") +# num_time_scale = int(total_time/interval_size) +# print(f'Discretizing data to {num_time_scale} timestamps...') +# # if num_time_scale == 0: +# # print("Warning! Only one timestamp exist in the data.") + +# updated_edgelist = {} +# for ts, edges_list in edgelist.items(): +# bin_ts = int(ts / interval_size) +# if bin_ts >= num_time_scale: +# bin_ts -= 1 + +# for edge in edges_list: +# if bin_ts not in updated_edgelist: +# updated_edgelist[bin_ts] = [] +# updated_edgelist[bin_ts].append(edge) +# print("Discretization Done..!") +# return updated_edgelist diff --git a/tgx/utils/newstat.py b/tgx/utils/newstat.py deleted file mode 100644 index 1df8cae..0000000 --- a/tgx/utils/newstat.py +++ /dev/null @@ -1,163 +0,0 @@ -from tgx.utils.plotting_utils import plot_for_snapshots, plot_nodes_edges_per_ts, plot_density_map -import networkx as nx -import numpy as np -from tgx.utils.graph_utils import train_test_split -from typing import List, Dict - -__all__ = ["connected_components_per_ts", - "size_connected_components", - "get_avg_node_engagement", - "degree_density"] - - -def degree_density(graph: tuple, k: int = 10, network_name: str = None, plot_path: str = None) -> None: - r""" - Plot density map of node degrees per time window - Parameters: - graph_edgelist: Dictionary containing graph data - k: number of time windows - network_name: name of the graph to be used in the output file name - plot_path: path to save the output figure - """ - graph_edgelist = graph.data - degrees_by_k_list = [] - temp = [] - temp_idx = 0 - unique_ts = list(graph_edgelist.keys()) - - for ts in unique_ts: - e_at_this_ts = graph_edgelist[ts] - G = nx.MultiGraph() - - for e in e_at_this_ts: - G.add_edge(e[0], e[1]) - - nodes = G.nodes() - degrees = [G.degree[n] for n in nodes] - - if temp_idx None: - r""" - Plot number of connected components per timestamp - Parameters: - graph: a list containing graph snapshots - network_name: name of the graph to be used in the output file name - plot_path: path to save the output figure - """ - num_components = [] - for t in range(len(graph.data)): - edgelist_t = graph.data[t] - nodes_t = graph.edgelist_node_list(edgelist_t) - parent = {node: node for node in nodes_t} - - for edge in edgelist_t: - (u, v) = edge - _merge(u, v, parent) - - num = 0 - for u in nodes_t: - if parent[u] == u: - num += 1 - num_components.append(num) - - if network_name is not None: - filename = f"{network_name}_connected_components_per_ts" - else: - filename = "_connected_components_per_ts" - - plot_for_snapshots(num_components, filename, "Number of connected components", plot_path = plot_path) - return - - -def size_connected_components(graph: tuple) -> List[List]: - r""" - Calculate the sizes of connected components per timestamp - Returns: - list[list]: A list containing lists of sizes of connected components for each timestamp. - """ - component_sizes = [] - for t in range(len(graph.data)): - edgelist_t = graph.data[t] - nodes_t = graph.edgelist_node_list(edgelist_t) - parent = {node: node for node in nodes_t} - - for edge in edgelist_t: - (u, v) = edge - _merge(u, v, parent) - - component_sizes_t = {} - for u in nodes_t: - root = _find(u, parent) - if root not in component_sizes_t: - component_sizes_t[root] = 0 - component_sizes_t[root] += 1 - - component_sizes_t_list = list(component_sizes_t.values()) - component_sizes.append(component_sizes_t_list) - - return component_sizes - - -def get_avg_node_engagement(graph: tuple) -> List[int]: - r""" - Calculate the average node engagement per timestamp, - the average number of distinct nodes that establish - at least one new connection. - Parameters: - graph_edgelist: Dictionary containing graph data - """ - engaging_nodes = [] - previous_edges = set() - - for ts in range(len(graph.data)): - edgelist_t = graph.data[ts] - new_nodes = set() - - for edge in edgelist_t: - (u, v) = edge - if frozenset({u, v}) not in previous_edges: - if u not in new_nodes: - new_nodes.add(u) - if v not in new_nodes: - new_nodes.add(v) - - engaging_nodes.append(len(new_nodes)) - previous_edges = {frozenset({u, v}) for (u, v) in edgelist_t} # Update the set of previous edges for next timestamp - - return engaging_nodes \ No newline at end of file diff --git a/tgx/utils/plotting_utils.py b/tgx/utils/plotting_utils.py index 3c3efad..f23df2c 100644 --- a/tgx/utils/plotting_utils.py +++ b/tgx/utils/plotting_utils.py @@ -28,8 +28,7 @@ def create_ts_list(start, end, metric=None, interval=None): def plot_nodes_edges_per_ts(edges: list, nodes: list, ts: list, - network_name: str, - plot_path: str = None, + filename: str = None, ylabel_1: str = 'Edges per Timestamp', ylabel_2: str = 'Nodes per Timestamp'): """ @@ -38,8 +37,7 @@ def plot_nodes_edges_per_ts(edges: list, edges: A list containing number of edges per timestamp nodes: A list containing number of nodes per timestamp ts: list of timestamps - network_name: Name of the network to be used in the output file name - plot_path: Path to save the output figure + filename: Name of the output file name, containing the path ylabel_1: Label for the edges per timestamp line ylabel_2: Label for the nodes per timestamp line """ @@ -59,10 +57,10 @@ def plot_nodes_edges_per_ts(edges: list, ax1.set_ylim(0) ax2.set_ylim(0) ax1.set_xlim(0, len(ts)-1) - if plot_path is not None: - filename = f"{network_name}_node&edge_per_ts" - plt.savefig(f'{plot_path}/{filename}') - plt.show() + if filename is not None: + plt.savefig(f'{filename}') + else: + plt.show() def plot_for_snapshots(data: list, y_title: str, @@ -72,10 +70,9 @@ def plot_for_snapshots(data: list, Plot a variable for different timestamps Parameters: data: A list of desired variable to be plotted - filename: Name of the output file name y_title: Title of the y axis + filename: Name of the output file name, containing the path show_ave: Whether to plot a line showing the average of the variable over all timestamps - plot_path: The path to save the output file ''' ts = list(range(0, len(data))) # plt.rcParams["font.family"] = "Times New Roman" @@ -96,9 +93,15 @@ def plot_for_snapshots(data: list, plt.show() -def plot_density_map(data, filename, y_title, plot_path=None): +def plot_density_map(data: list, + y_title: str, + filename: str = None,): ''' Plot a density map using fig and ax + Parameters: + data: A list of desired variable to be plotted + y_title: Title of the y axis + filename: Name of the output file name, containing the path ''' max_value = max(max(inner) for inner in data if inner) c = np.zeros((max_value, len(data))) @@ -125,9 +128,10 @@ def plot_density_map(data, filename, y_title, plot_path=None): # Adjust the aspect ratio of the plot ax.set_aspect('auto') - if plot_path is not None: - plt.savefig(f'{plot_path}/{filename}') - plt.show() + if filename is not None: + plt.savefig(f'{filename}') + else: + plt.show() if __name__ == "__main__": create_ts_list(86400, 86400*365, "unix", "month") diff --git a/tgx/utils/stat.py b/tgx/utils/stat.py index a2efa85..b1dc5e4 100644 --- a/tgx/utils/stat.py +++ b/tgx/utils/stat.py @@ -1,7 +1,7 @@ from tgx.utils.plotting_utils import plot_for_snapshots, plot_nodes_edges_per_ts, plot_density_map import networkx as nx import numpy as np -from tgx.utils.graph_utils import train_test_split +from typing import List __all__ = ["degree_over_time", "nodes_over_time", @@ -15,6 +15,8 @@ "get_surprise", "get_novelty", "get_avg_node_activity", + "connected_components_per_ts", + "size_connected_components", "get_avg_node_engagement", "degree_density"] @@ -34,33 +36,30 @@ def _merge(x, y, parent): parent[root_x] = root_y - def degree_over_time(graph: object, network_name: str, - filepath: str = ".") -> None: + filepath: str = "./") -> None: r''' Plot average degree per timestamp. Parameters: graph: Graph object created by tgx.Graph containing edgelist - total_nodes: number of nodes that appear through all the snapshots network_name: name of the graph to be used in the output file name filepath: path to save the output figure ''' - print("Plotting average degree per timestamp") ave_degree = _calculate_average_degree_per_ts(graph) if network_name is not None: filename = f"{network_name}_ave_degree_per_ts" else: filename = "ave_degree_per_ts" - plot_for_snapshots(ave_degree, filename, "Average degree", plot_path = filepath) + plot_for_snapshots(ave_degree, y_title= "Average degree", filename=filepath+filename) return def nodes_over_time(graph: object, network_name: str, - filepath: str = ".") -> None: + filepath: str = "./") -> None: r''' Plot number of active nodes per timestamp. @@ -69,19 +68,17 @@ def nodes_over_time(graph: object, network_name: name of the graph to be used in the output file name filepath: path to save the output figure ''' - print("Plotting number of nodes per timestamp.") active_nodes = _calculate_node_per_ts(graph) if network_name is not None: filename = f"{network_name}_nodes_per_ts" else: filename = "nodes_per_ts" - plot_for_snapshots(active_nodes, filename, "Number of nodes", plot_path = filepath) + plot_for_snapshots(active_nodes, y_title="Number of nodes", filename=filepath+filename) return def edges_over_time(graph: object, - plot_path: str = None, network_name: str = None, - filepath: str = ".") -> None: + filepath: str = "./") -> None: r''' Plot number of edges per timestamp. Parameters: @@ -89,18 +86,17 @@ def edges_over_time(graph: object, network_name: name of the graph to be used in the output file name filepath: path to save the output figure ''' - print("Plotting number of edges per timestamp.") active_edges = _calculate_edge_per_ts(graph) if network_name is not None: filename = f"{network_name}_edges_per_ts" else: filename = "_edges_per_ts" - plot_for_snapshots(active_edges, plot_path, filename, "Number of edges", plot_path = filepath) + plot_for_snapshots(active_edges, y_title="Number of edges", filename=filepath+filename) return def nodes_and_edges_over_time(graph: object, network_name: str , - filepath: str = "."): + filepath: str = "./"): r""" Plot number of nodes per timestamp and number of edges per timestamp in one fiugre. Parameters: @@ -112,9 +108,11 @@ def nodes_and_edges_over_time(graph: object, edges = _calculate_edge_per_ts(graph) nodes = _calculate_node_per_ts(graph) ts = list(range(0, len(graph.data))) - - - return plot_nodes_edges_per_ts(edges, nodes, ts, network_name = network_name, plot_path = filepath) + if network_name is not None: + filename = f"{network_name}_node_and_edges_per_ts" + else: + filename = "node_and_edges_per_ts" + return plot_nodes_edges_per_ts(edges, nodes, ts, filename=filepath+filename) @@ -377,10 +375,17 @@ def get_avg_node_engagement(graph: object): previous_edges = {frozenset({u, v}) for (u, v) in e_list} # Update the set of previous edges for the next timestamp return engaging_nodes - -def degree_density(graph: object, network_name: str = None, k = 10, plot_path: str = None) -> None: +def degree_density(graph: tuple, + k: int = 10, + network_name: str = None, + plot_path: str = "./") -> None: r""" - plot the density map of node degrees over timestamps + Plot density map of node degrees per time window + Parameters: + graph_edgelist: Dictionary containing graph data + k: number of time windows + network_name: name of the graph to be used in the output file name + plot_path: path to save the output figure """ graph_edgelist = graph.data degrees_by_k_list = [] @@ -414,13 +419,13 @@ def degree_density(graph: object, network_name: str = None, k = 10, plot_path: s else: filename = "_degree_density" - plot_density_map(degrees_by_k_list, filename, "Node Degree", plot_path = plot_path) + plot_density_map(degrees_by_k_list, y_title="Node Degree", filename = plot_path + filename) return def connected_components_per_ts(graph: tuple, network_name: str = None, - plot_path: str = None) -> None: + plot_path: str = "./") -> None: r""" Plot number of connected components per timestamp Parameters: @@ -449,5 +454,64 @@ def connected_components_per_ts(graph: tuple, else: filename = "_connected_components_per_ts" - plot_for_snapshots(num_components, filename, "Number of connected components", plot_path = plot_path) - return \ No newline at end of file + plot_for_snapshots(num_components, y_title="Number of connected components", filename=plot_path+filename) + return + + +# TODO turn this into a plotting function as well, can return the computed stats +def size_connected_components(graph: tuple) -> List[List]: + r""" + Calculate the sizes of connected components per timestamp + Returns: + list[list]: A list containing lists of sizes of connected components for each timestamp. + """ + component_sizes = [] + for t in range(len(graph.data)): + edgelist_t = graph.data[t] + nodes_t = graph.edgelist_node_list(edgelist_t) + parent = {node: node for node in nodes_t} + + for edge in edgelist_t: + (u, v) = edge + _merge(u, v, parent) + + component_sizes_t = {} + for u in nodes_t: + root = _find(u, parent) + if root not in component_sizes_t: + component_sizes_t[root] = 0 + component_sizes_t[root] += 1 + + component_sizes_t_list = list(component_sizes_t.values()) + component_sizes.append(component_sizes_t_list) + + return component_sizes + +# TODO turn this into a plotting function as well, can return the computed stats +def get_avg_node_engagement(graph: tuple) -> List[int]: + r""" + Calculate the average node engagement per timestamp, + the average number of distinct nodes that establish + at least one new connection. + Parameters: + graph_edgelist: Dictionary containing graph data + """ + engaging_nodes = [] + previous_edges = set() + + for ts in range(len(graph.data)): + edgelist_t = graph.data[ts] + new_nodes = set() + + for edge in edgelist_t: + (u, v) = edge + if frozenset({u, v}) not in previous_edges: + if u not in new_nodes: + new_nodes.add(u) + if v not in new_nodes: + new_nodes.add(v) + + engaging_nodes.append(len(new_nodes)) + previous_edges = {frozenset({u, v}) for (u, v) in edgelist_t} # Update the set of previous edges for next timestamp + + return engaging_nodes \ No newline at end of file diff --git a/tgx/viz/TEA.py b/tgx/viz/TEA.py index 57b130c..e684f4d 100644 --- a/tgx/viz/TEA.py +++ b/tgx/viz/TEA.py @@ -1,224 +1,3 @@ -<<<<<<< HEAD -import pandas as pd -import matplotlib.pyplot as plt -from typing import Union, Optional -from tgx.utils.graph_utils import discretize_edges -from tgx.utils.plotting_utils import create_ts_list -__all__ = ["TEA"] - -def TEA( - temp_edgelist : Union[object, dict], - filepath : Optional[str] = ".", - fig_size : tuple = (7,5), - font_size : int = 20, - network_name : str = None, - time_scale : Union[str, int] = None, - real_dates : bool = None, - test_split : bool = False, - density : bool = False - ): - r""" - generating TEA plot - - Parameters: - temp_edgelist: a dictionary of temporal edges or a dataset object. - filepath: Path to save the TEA Plot. - fig_size: Size of the figure to save. - font_size: Size of the text in the figure. - network_name: Name of the dataset to be used in the TEA plot file. - time_scale: time_scale for discretizing data if already not done. - real_dates: Whether to use the real dates from dataset. - test_split: Whether show the test split on the plot. - density: Whether to return edge density and edge frequency dictioneries. - """ - if isinstance(temp_edgelist, object): - if temp_edgelist.freq_data is None: - temp_edgelist.count_freq() - temp_edgelist = temp_edgelist.freq_data - - # check number of unique timestamps: - unique_ts = list(temp_edgelist.keys()) - # if len(unique_ts) > max_time_scale: - # inp = input(f"There are {unique_ts} timestamps in the data.\nDo you want to discretize the data to 1000 timestamps?(y/n)").lower() - # if inp == "y": - # temp_edgelist = edgelist_discritizer(temp_edgelist, - # unique_ts, - # time_scale = max_time_scale) - if time_scale is not None: - temp_edgelist = discretize_edges(temp_edgelist, - time_scale = time_scale) - - - ts_edges_dist, ts_edges_dist_density, edge_frequency_dict = TEA_process_edgelist_per_timestamp(temp_edgelist) - - TEA_plot_edges_bar(ts_edges_dist, - filepath = filepath, - fig_size = fig_size, - font_size = font_size, - network_name=network_name, - real_dates = real_dates, - test_split = test_split) - - if density: - return ts_edges_dist_density, edge_frequency_dict - - - -def TEA_process_edgelist_per_timestamp(temp_edgelist): - # generate distribution of the edges history - unique_ts = list(temp_edgelist.keys()) - # unique_ts.sort() - # print(f"There are {len(unique_ts)} timestamps.") - - # get node set & total number of nodes - node_dict = {} - for t, e_dict in temp_edgelist.items(): - for e, exist in e_dict.items(): - if e[0] not in node_dict: - node_dict[e[0]] = 1 - if e[1] not in node_dict: - node_dict[e[1]] = 1 - num_nodes = len(node_dict) - num_e_fully_connected = num_nodes * (num_nodes - 1) - - edge_frequency_dict = {} # how many times an edge is seen - ts_edges_dist = [] # contains different features specifying the characteristics of the edge distribution over time - ts_edges_dist_density = [] - for curr_t in unique_ts: - - # if curr_t < 2: - # print("curr_t", curr_t) - prev_ts = [ts for ts in unique_ts if ts < curr_t] - edges_in_prev_ts = {} - for bts in prev_ts: - edges_in_prev_ts.update(temp_edgelist[bts]) - - curr_ts_edge_list = temp_edgelist[curr_t] - for e in curr_ts_edge_list: - if e not in edge_frequency_dict: - edge_frequency_dict[e] = 1 - else: - edge_frequency_dict[e] += 1 - - if len(curr_ts_edge_list) > 0: - curr_ts_edges_dist = {'ts': curr_t, - 'new': len([e for e in curr_ts_edge_list if e not in edges_in_prev_ts]), - 'repeated': len([e for e in curr_ts_edge_list if e in edges_in_prev_ts]), - 'not_repeated': len([e for e in edges_in_prev_ts if e not in curr_ts_edge_list]), - 'total_curr_ts': len(curr_ts_edge_list), - 'total_seen_until_curr_ts': len(edges_in_prev_ts) + len(curr_ts_edge_list) - } - curr_ts_edges_dist_density = {'ts': curr_t, - 'new': (curr_ts_edges_dist['new'] * 1.0) / num_e_fully_connected, - 'repeated': (curr_ts_edges_dist['repeated'] * 1.0) / num_e_fully_connected, - 'not_repeated': (curr_ts_edges_dist[ - 'not_repeated'] * 1.0) / num_e_fully_connected, - 'total_curr_ts': (curr_ts_edges_dist[ - 'total_curr_ts'] * 1.0) / num_e_fully_connected, - 'total_seen_until_curr_ts': (curr_ts_edges_dist[ - 'total_seen_until_curr_ts'] * 1.0) / num_e_fully_connected, - } - else: - curr_ts_edges_dist = {'ts': curr_t, - 'new': 0, - 'repeated': 0, - 'not_repeated': 0, - 'total_curr_ts': 0, - 'total_seen_until_curr_ts': len(edges_in_prev_ts) + len(curr_ts_edge_list) - } - curr_ts_edges_dist_density = {'ts': curr_t, - 'new': 0, - 'repeated': 0, - 'not_repeated': 0, - 'total_curr_ts': 0, - 'total_seen_until_curr_ts': 0, - } - ts_edges_dist.append(curr_ts_edges_dist) - ts_edges_dist_density.append(curr_ts_edges_dist_density) - # print(len(edges_in_prev_ts)) - # print(len(ts_edges_dist)) - # print(edge_frequency_dict) - # break - return ts_edges_dist, ts_edges_dist_density, edge_frequency_dict - - -def TEA_plot_edges_bar(ts_edges_dist: list, - filepath: str = ".", - fig_size: list = (9,5), - font_size: int = 20, - network_name: str = None, - real_dates: list = None, - time_scale: list = None, - test_split: bool = False, - show: bool =False): - r""" - Making TEA plot and save into pdf file. - Args: - ts_edges_dist: list of dictionaries containing the edge distribution over time. - filepath: Path to save the TEA Plot. - fig_size: Size of the figure to save. - font_size: Size of the text in the figure. - network_name: Name of the dataset to be used in the TEA plot file. - real_dates: list of real dates as ticks - time_scale: time_scale for discretizing data if already not done. - test_split: Whether show the test split on the plot. - show: Whether to show the plot. - """ - - - ts_edges_dist_df = pd.DataFrame(ts_edges_dist, columns=['ts', 'new', 'repeated', - 'not_repeated', - 'total_curr_ts', - 'total_seen_until_curr_ts']) - - - ### Additional Stats ### - mean = ts_edges_dist_df.mean(axis=0) - # print("INFO: Network Name:", network_name) - # print("INFO: AVG. stats. over all timestamps: ", mean) - # print("INFO: ratio of avg.(new)/avg.(total_curr_ts): {:.2f}".format(mean['new'] / mean['total_curr_ts'])) - ### - - fig, ax = plt.subplots(figsize=fig_size) # lastfm, mooc, reddit, UNtrade, UNvote - plt.subplots_adjust(bottom=0.2, left=0.2) - font_size = font_size - ticks_font_size = 15 - plt.yticks(fontsize=ticks_font_size) - plt.xticks(fontsize=ticks_font_size) - if real_dates is not None: - start = real_dates[0] - end = real_dates[1] - metric = real_dates[2] - create_ts_list(start, end, metric=metric, interval=time_scale) - else: - duration = ts_edges_dist_df['ts'].tolist() - timestamps = [i for i in range(len(duration))] - - new = ts_edges_dist_df['new'].tolist() - repeated = ts_edges_dist_df['repeated'].tolist() - # print(len(timestamps), repeated, new) - # plotting stuffs - # bar plot - plt.bar(timestamps, repeated, label='Repeated', color='#404040', alpha=0.4) - plt.bar(timestamps, new, label='New', bottom=repeated, color='#ca0020', alpha=0.8, hatch='//') - # test split line - if test_split: - plt.axvline(x=(timestamps[int(0.85 * len(timestamps))]), color="blue", linestyle="--", linewidth=2) - plt.text((timestamps[int(0.85 * len(timestamps))]), 0, - 'x', va='center', ha='center', fontsize=font_size, fontweight='heavy', color='blue') - - plt.margins(x=0) - plt.xlabel("Timestamp", fontsize=font_size) - plt.ylabel("Number of edges", fontsize=font_size) - plt.legend(fontsize = 13) - if filepath is not None: - plt.savefig(f"{filepath}/{network_name}_TEA.pdf") - print("plot saved as " + f"{filepath}/{network_name}_TEA.pdf") - if (show): - plt.show() - - -======= import pandas as pd import matplotlib.pyplot as plt from typing import Union, Optional @@ -438,4 +217,3 @@ def TEA_plot_edges_bar(ts_edges_dist: list, plt.show() ->>>>>>> master diff --git a/tgx/viz/TET.py b/tgx/viz/TET.py index 15cbda7..52bb2a9 100644 --- a/tgx/viz/TET.py +++ b/tgx/viz/TET.py @@ -1,340 +1,3 @@ -<<<<<<< HEAD -# TET Plot -import numpy as np -import pandas as pd -import seaborn as sns -from tqdm import tqdm -from typing import Union, Optional -import matplotlib.pyplot as plt -from tgx.utils.graph_utils import discretize_edges - - -# some parameters to be used for drawing -E_ABSENT = 0 -E_PRESENCE_GENERAL = 1 -E_SEEN_IN_TRAIN = 2 -E_IN_TEST = 3 -E_NOT_IN_TEST = 4 - -TEST_RATIO = 0.15 - -# new color controlling parameters; Date: Dec. 22, 2021 -E_ONLY_TRAIN = 10 -E_TRAIN_AND_TEST = 20 -E_TRANSDUCTIVE = 30 -E_INDUCTIVE = 40 - - -#! should be merged graph class? -def TET(temp_edgelist : Union[object, dict], - filepath: Optional[str] = ".", - time_scale : Union[str, int] = None, - network_name : str = None, - add_frame : bool = True, - test_split : bool = False, - figsize : tuple = (9, 5), - axis_title_font_size : int = 20, - ticks_font_size : int = 20, - show: bool = True): - r""" - Generate TET plots - Args: - temp_edgelist: a dictionary of temporal edges or a dataset object. - filepath: Path to save the TEA Plot. - figsize: Size of the figure to save. - axis_title_font_size: The font size of xis titles. - ticks_font_size: Size of the text in the figure. - add_frame: Add the frame to the plot. - network_name: Name of the dataset to be used in the TEA plot file. - time_scale: time_scale for discretizing data if already not done. - test_split: Whether show the test split on the plot. - max_time_scale: Maximum number of time_scale to discretize data. - show: Whether to show the plot. - """ - if isinstance(temp_edgelist, object): - if temp_edgelist.freq_data is None: - temp_edgelist.count_freq() - temp_edgelist = temp_edgelist.freq_data - - # check number of unique timestamps: - unique_ts = list(temp_edgelist.keys()) - # if len(unique_ts) > max_time_scale: - # inp = input(f"There are {unique_ts} timestamps in the data.\nDo you want to discretize the data to 1000 timestamps?(y/n)").lower() - # if inp == "y": - # temp_edgelist = edgelist_discritizer(temp_edgelist, - # unique_ts, - # time_scale = max_time_scale) - if time_scale is not None: - temp_edgelist = discretize_edges(temp_edgelist, - time_scale = time_scale) - - edge_last_ts = generate_edge_last_timestamp(temp_edgelist) - edge_idx_map = generate_edge_idx_map(temp_edgelist, edge_last_ts) - idx_edge_map = {v: k for k, v in edge_idx_map.items()} # key: edge index; value: actual edge (source, destination) - print("Info: Number of distinct edges (from index-edge map): {}".format(len(idx_edge_map))) - - unique_ts_list = list(temp_edgelist.keys()) - e_presence_mat = generate_edge_presence_matrix(unique_ts_list, idx_edge_map, edge_idx_map, temp_edgelist) - print("Info: edge-presence-matrix shape: {}".format(e_presence_mat.shape)) - # print(np.unique(e_presence_mat, return_counts=True)) - e_presence_mat, test_split_ts_value = process_presence_matrix(e_presence_mat, test_ratio_p=0.85) - print("Info: edge-presence-matrix shape: {}".format(e_presence_mat.shape)) - # print(np.unique(e_presence_mat, return_counts=True)) - fig_param = set_fig_param(network_name, - fig_name = filepath, - figsize = figsize, - axis_title_font_size = axis_title_font_size, - ticks_font_size = ticks_font_size) - - plot_edge_presence_matrix(e_presence_mat, test_split_ts_value, unique_ts_list, list(idx_edge_map.keys()), - fig_param, test_split = test_split, add_frames=add_frame, show=show) - return - - -def generate_edge_last_timestamp(edges_per_ts): - """generates a dictionary containing the last timestamp of each edge""" - edge_last_ts = {} - for ts, e_list in edges_per_ts.items(): - for e in e_list: - if e not in edge_last_ts: - edge_last_ts[e] = ts - else: - edge_last_ts[e] = max(ts, edge_last_ts[e]) - return edge_last_ts - - -def generate_edge_idx_map(edges_per_ts, edge_last_ts): - """ - generates index for edges according to two-level sorting policy: - 1. the first level is based on their first appearance timestamp - 2. the second level is based on their last appearance timestamp - """ - edge_idx_map = {} # key: actual edge (source, destination), value: edge index - distinct_edge_idx = 0 - for ts, ts_e_list in edges_per_ts.items(): - e_last_ts_this_timestamp = {} - for e in ts_e_list: - e_last_ts_this_timestamp[e] = edge_last_ts[e] - e_last_ts_this_timestamp = dict(sorted(e_last_ts_this_timestamp.items(), key=lambda item: item[1])) - for e in e_last_ts_this_timestamp: - if e not in edge_idx_map: - edge_idx_map[e] = distinct_edge_idx - distinct_edge_idx += 1 - - return edge_idx_map - - -def generate_edge_presence_matrix(unique_ts_list, idx_edge_map, edge_idx_map, edges_per_ts): - ''' - Returns presence matrix with values 0 and 1 which indicate: - value = 0 : edge is not present in this timestamp - value = 1 : edge is present in this timestamp - - shape: (ts, total number of edges) - ''' - num_unique_ts = len(unique_ts_list) - num_unique_edge = len(idx_edge_map) - e_presence_mat = np.zeros([num_unique_ts, num_unique_edge], dtype=np.int8) - unique_ts_list = np.sort(unique_ts_list) - - for x, ts in tqdm(enumerate(unique_ts_list)): - es_ts = edges_per_ts[ts] - for e in es_ts: - e_presence_mat[num_unique_ts - x - 1, edge_idx_map[e]] = E_PRESENCE_GENERAL - - return e_presence_mat - -def process_presence_matrix(e_presence_matrix, test_ratio_p): - """ - there are 4 types of edge presence: - 1. only in train - 2. in train and in test - 3. in test and train (which is the number 2 but in later timestamps) - 4. only in test - X: timestamp - Y: edge index - """ - num_unique_ts = e_presence_matrix.shape[0] - num_unique_edges = e_presence_matrix.shape[1] - ts_idx_list = [i for i in range(num_unique_ts)] - - # generating timestamp list for train and test: - test_split_ts_value = int(np.quantile(ts_idx_list, test_ratio_p)) - train_ts_list = [ts for ts in ts_idx_list if ts <= test_split_ts_value] # any timestamp in train/validation split - test_ts_list = [ts for ts in ts_idx_list if ts > test_split_ts_value] # test_split_ts_value is in train - - # first level processing: differentiate train set edges: 1) Only in train set, 2) in train & test set - print("First level processing: ") - print("Detecting edges present in train & test sets") - for tr_ts in tqdm(train_ts_list): - for eidx in range(num_unique_edges): - if e_presence_matrix[num_unique_ts - tr_ts - 1, eidx] == E_PRESENCE_GENERAL: - for test_ts_idx in range(test_split_ts_value + 1, num_unique_ts): - if e_presence_matrix[num_unique_ts - test_ts_idx - 1, eidx] == E_PRESENCE_GENERAL: # if seen in - # the test set - e_presence_matrix[num_unique_ts - tr_ts - 1, eidx] = E_TRAIN_AND_TEST - break - - # differentiate test set edges: 1) transductive (seen in train, repeating in test), 2) inductive (only in test) - print("Detecting transductive edges (seen in train, repeating in test)") - for ts in tqdm(test_ts_list): - for eidx in range(num_unique_edges): - if e_presence_matrix[num_unique_ts - ts - 1, eidx] == E_PRESENCE_GENERAL: - for prev_ts_idx in range(test_split_ts_value, -1, -1): - if e_presence_matrix[num_unique_ts - prev_ts_idx - 1, eidx] == E_TRAIN_AND_TEST: # if seen in - # the training set - e_presence_matrix[num_unique_ts - ts - 1, eidx] = E_TRANSDUCTIVE - break - - # second level processing - print("Second level processing:") - print("Detecting edges 1) Only in train set, 2) only in test (inductive)") - for ts in tqdm(range(num_unique_ts)): - for eidx in range(num_unique_edges): - if ts <= test_split_ts_value: - if e_presence_matrix[num_unique_ts - ts - 1, eidx] == E_PRESENCE_GENERAL: - e_presence_matrix[num_unique_ts - ts - 1, eidx] = E_ONLY_TRAIN - else: - if e_presence_matrix[num_unique_ts - ts - 1, eidx] == E_PRESENCE_GENERAL: - e_presence_matrix[num_unique_ts - ts - 1, eidx] = E_INDUCTIVE - - return e_presence_matrix, test_split_ts_value - - -def plot_edge_presence_matrix(e_presence_mat, - test_split_ts_value, - unique_ts_list, - idx_edge_list, - fig_param, - test_split = False, - add_frames=True, - show=False): - print("Info: plotting edge presence heatmap for {} ...".format(fig_param.fig_name)) - - fig, ax = plt.subplots(figsize=fig_param.figsize) - plt.subplots_adjust(bottom=0.3, left=0.2) - - # colors = ['white', # E_ABSENCE - # '#67a9cf', # E_ONLY_TRAIN - # '#ef8a62', # E_TRAIN_AND_TEST - # '#ef8a62', # E_TRANSDUCTIVE - # '#b2182b' # E_INDUCTIVE - # ] - if test_split: - colors = ['white', # E_ABSENCE - '#018571', # E_ONLY_TRAIN 2c7bb6 - '#fc8d59', # E_TRAIN_AND_TEST - '#fc8d59', # E_TRANSDUCTIVE - '#b2182b' # E_INDUCTIVE - ] - else: - colors = ['white', - '#ca0020', - '#ca0020', - '#ca0020', - '#ca0020',] - # print(sns.color_palette(colors, as_cmap=True)) - frame_color = "grey" # "#bababa" - time_split_color = "black" - axis_title_font_size = fig_param.axis_title_font_size - x_font_size = fig_param.ticks_font_size - y_font_size = fig_param.ticks_font_size - - ax = sns.heatmap(e_presence_mat, cmap=sns.color_palette(colors, as_cmap=True), cbar=False) - - # processing x-axis - x_gaps = np.linspace(0, len((idx_edge_list)), num=5) - x_labels = x_gaps / len(idx_edge_list) - x_labels = [int(100*x) for x in x_labels] - plt.xticks(x_gaps, x_labels, rotation=0, fontsize=x_font_size) - - # processing y-axis - t_gaps = np.linspace(0, len(unique_ts_list), num=5) - t_labels = [int(len(unique_ts_list) - tidx) for tidx in t_gaps] - plt.yticks(t_gaps, t_labels, rotation=90, fontsize=y_font_size) - - # axis & title - # plt.margins(x=0) - plt.xlabel("Percentage of observed edges", fontsize=axis_title_font_size) - plt.ylabel("Timestamp", fontsize=axis_title_font_size) - - # requirements for additional features - x_length = e_presence_mat.shape[1] - 1 - y_length = e_presence_mat.shape[0] - 1 - test_split_idx_value = y_length - test_split_ts_value - e_border_idx = 0 - for e_idx in range(e_presence_mat.shape[1] - 1, -1, -1): - if e_presence_mat[y_length - test_split_ts_value, e_idx] != E_ABSENT: - e_border_idx = e_idx - break - - # rectangle for different parts of the dataset - if add_frames and test_split: - print("Info: Border edge index:", e_border_idx) - print("Info: Test split timestamp value:", test_split_ts_value) - rect_train = plt.Rectangle((0, y_length - test_split_ts_value + 0.085), e_border_idx, test_split_ts_value + 0.9, - fill=False, linewidth=2, edgecolor=frame_color) - rect_test_mayseen = plt.Rectangle((0, 0), e_border_idx, y_length - test_split_ts_value - 0.1, - fill=False, linewidth=2, edgecolor=frame_color) - rect_test_new = plt.Rectangle((e_border_idx, 0), x_length - e_border_idx, - y_length - test_split_ts_value - 0.1, - fill=False, linewidth=2, edgecolor=frame_color) - ax = ax or plt.gca() - ax.add_patch(rect_train) - ax.add_patch(rect_test_mayseen) - ax.add_patch(rect_test_new) - - elif add_frames: - ax.add_patch(plt.Rectangle((0, 0), x_length, y_length+1, - fill=False, linewidth=2, edgecolor=frame_color)) - # test split horizontal line - if test_split: - plt.axhline(y=test_split_idx_value, color=time_split_color, linestyle="--", linewidth=2, label='x') - plt.text(x=0, y=test_split_idx_value, s='x', color=time_split_color, va='center', ha='center', - fontsize=y_font_size, fontweight='heavy') - - if fig_param.fig_name is not None: - # print("Info: file name: {}".format(fig_param.fig_name)) - plt.savefig(f"{fig_param.fig_name}/{fig_param.network_name}_TET.pdf") - plt.show() - print("Info: plotting done!") - -def set_fig_param(network_name, fig_name = None, - figsize = (9, 5), - axis_title_font_size = 20, - ticks_font_size = 22, - axis_tick_gap = 20, - timestamp_split_cross_mark_offset = 1): - - # if network_name in ['US Legislative', 'Canadian Vote', 'UN Trade', 'UN Vote']: - # axis_tick_gap = axis_tick_gap * 0.35 - - # elif network_name in ['Reddit', 'Wikipedia', 'UCI', 'Social Evo.', 'Flights', 'LastFM', 'MOOC']: - # axis_tick_gap = axis_tick_gap * 0.5 - - # elif network_name in ['Enron']: - # axis_tick_gap = axis_tick_gap * 0.4 - - fig_param = Fig_Param(network_name, - fig_name, - figsize, - axis_title_font_size, - ticks_font_size, - axis_tick_gap, - timestamp_split_cross_mark_offset) - - return fig_param - -class Fig_Param: - def __init__(self, network_name, fig_name, figsize, axis_title_font_size, ticks_font_size, axis_tick_gap, - timestamp_split_cross_mark_offset): - self.network_name = network_name - self.fig_name = fig_name - self.figsize = figsize - self.axis_title_font_size = axis_title_font_size - self.ticks_font_size = ticks_font_size - self.axis_tick_gap = axis_tick_gap -======= # TET Plot import numpy as np import pandas as pd @@ -670,5 +333,4 @@ def __init__(self, network_name, fig_name, figsize, axis_title_font_size, ticks_ self.axis_title_font_size = axis_title_font_size self.ticks_font_size = ticks_font_size self.axis_tick_gap = axis_tick_gap ->>>>>>> master self.timestamp_split_cross_mark_offset = timestamp_split_cross_mark_offset \ No newline at end of file