"""
Adapted from the CPF implementation
https://github.com/BUPT-GAMMA/CPF/tree/389c01aaf238689ee7b1e5aba127842341e123b6/data
"""

from collections import Counter

import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer


def is_binary_bag_of_words(features):
    """Check whether every stored entry of a sparse feature matrix equals 1.0."""
    features_coo = features.tocoo()
    return all(
        single_entry == 1.0
        for _, _, single_entry in zip(
            features_coo.row, features_coo.col, features_coo.data
        )
    )
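
# Illustrative check (hypothetical data, not from the upstream file): the
# function returns True only if every stored entry is exactly 1.0.
#
#     feats = sp.csr_matrix(np.array([[0.0, 1.0], [1.0, 1.0]]))
#     is_binary_bag_of_words(feats)                        # True
#     is_binary_bag_of_words(sp.csr_matrix([[0.5, 1.0]]))  # False (TF/IDF-like)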


def to_binary_bag_of_words(features):
    """Converts TF/IDF features to binary bag-of-words features."""
    # Copy explicitly: plain `tocsr()` returns the same object when the input
    # is already CSR, which would silently mutate the caller's matrix.
    features_copy = features.tocsr(copy=True)
    features_copy.data[:] = 1.0
    return features_copy
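
# Illustrative conversion (hypothetical data): every stored entry becomes 1.0
# while the sparsity pattern is preserved, e.g. a TF/IDF row [0.0, 0.31, 0.95]
# becomes [0.0, 1.0, 1.0].
#
#     tfidf = sp.csr_matrix(np.array([[0.0, 0.31, 0.95]]))
#     to_binary_bag_of_words(tfidf).toarray()  # [[0., 1., 1.]]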


def normalize(mx):
    """Row-normalize a sparse matrix."""
    rowsum = np.array(mx.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.0  # leave all-zero rows untouched
    r_mat_inv = sp.diags(r_inv)
    mx = r_mat_inv.dot(mx)
    return mx
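
# Worked example (hypothetical data): the result is D^{-1} M with
# D = diag(row sums); all-zero rows stay zero because their infinite
# inverse is reset to 0 above.
#
#     m = sp.csr_matrix(np.array([[1.0, 3.0], [0.0, 0.0]]))
#     normalize(m).toarray()  # [[0.25, 0.75], [0., 0.]]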


def normalize_adj(adj):
    """Add self-loops to the adjacency matrix and row-normalize it."""
    adj = normalize(adj + sp.eye(adj.shape[0]))
    return adj
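
# Note: this is the random-walk normalization D~^{-1} (A + I) of the
# self-loop-augmented adjacency, not the symmetric D~^{-1/2} (A + I) D~^{-1/2}
# variant that some GCN implementations use instead.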


def eliminate_self_loops_adj(A):
    """Remove self-loops from the adjacency matrix."""
    A = A.tolil()  # LIL format supports cheap diagonal assignment
    A.setdiag(0)
    A = A.tocsr()
    A.eliminate_zeros()  # drop the zeroed diagonal from the sparsity pattern
    return A
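
# Illustrative effect (hypothetical data): diagonal entries are zeroed and then
# pruned from the sparsity pattern.
#
#     a = sp.csr_matrix(np.array([[1.0, 1.0], [0.0, 1.0]]))
#     eliminate_self_loops_adj(a).toarray()  # [[0., 1.], [0., 0.]]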


def largest_connected_components(sparse_graph, n_components=1):
    """Select the largest connected components in the graph.

    Parameters
    ----------
    sparse_graph : SparseGraph
        Input graph.
    n_components : int, default 1
        Number of largest connected components to keep.

    Returns
    -------
    sparse_graph : SparseGraph
        Subgraph of the input graph where only the nodes in the largest
        n_components connected components are kept.
    """
    _, component_indices = sp.csgraph.connected_components(sparse_graph.adj_matrix)
    component_sizes = np.bincount(component_indices)
    components_to_keep = np.argsort(component_sizes)[::-1][
        :n_components
    ]  # reverse order to sort descending
    nodes_to_keep = [
        idx
        for (idx, component) in enumerate(component_indices)
        if component in components_to_keep
    ]
    return create_subgraph(sparse_graph, nodes_to_keep=nodes_to_keep)
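
# Sketch of intended usage (assumes a SparseGraph-like object `g` with an
# `adj_matrix` attribute; no such class is defined in this module):
#
#     g = largest_connected_components(g, n_components=1)  # keep the giant component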


def create_subgraph(
    sparse_graph, _sentinel=None, nodes_to_remove=None, nodes_to_keep=None
):
    """Create a graph with the specified subset of nodes.

    Exactly one of (nodes_to_remove, nodes_to_keep) should be provided, while
    the other stays None. Note that to avoid confusion, it is required to pass
    node indices as named arguments to this function.

    Parameters
    ----------
    sparse_graph : SparseGraph
        Input graph.
    _sentinel : None
        Internal, to prevent passing positional arguments. Do not use.
    nodes_to_remove : array-like of int
        Indices of nodes that have to be removed.
    nodes_to_keep : array-like of int
        Indices of nodes that have to be kept.

    Returns
    -------
    sparse_graph : SparseGraph
        Graph with the specified nodes removed.
    """
    # Check that arguments are passed correctly
    if _sentinel is not None:
        raise ValueError(
            "Only call `create_subgraph` with named arguments,"
            " (nodes_to_remove=...) or (nodes_to_keep=...)"
        )
    if nodes_to_remove is None and nodes_to_keep is None:
        raise ValueError("Either nodes_to_remove or nodes_to_keep must be provided.")
    elif nodes_to_remove is not None and nodes_to_keep is not None:
        raise ValueError(
            "Only one of nodes_to_remove or nodes_to_keep must be provided."
        )
    elif nodes_to_remove is not None:
        nodes_to_keep = [
            i for i in range(sparse_graph.num_nodes()) if i not in nodes_to_remove
        ]
    elif nodes_to_keep is not None:
        nodes_to_keep = sorted(nodes_to_keep)
    else:
        raise RuntimeError("This should never happen.")
    sparse_graph.adj_matrix = sparse_graph.adj_matrix[nodes_to_keep][:, nodes_to_keep]
    if sparse_graph.attr_matrix is not None:
        sparse_graph.attr_matrix = sparse_graph.attr_matrix[nodes_to_keep]
    if sparse_graph.labels is not None:
        sparse_graph.labels = sparse_graph.labels[nodes_to_keep]
    if sparse_graph.node_names is not None:
        sparse_graph.node_names = sparse_graph.node_names[nodes_to_keep]
    return sparse_graph
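
# Sketch of intended usage (hypothetical SparseGraph `g`): exactly one of the
# two keyword arguments must be given, and positional indices are rejected.
#
#     create_subgraph(g, nodes_to_keep=[0, 2, 5])   # keep only these nodes
#     create_subgraph(g, nodes_to_remove=[1, 3])    # or drop these instead
#     create_subgraph(g, [0, 2, 5])                 # raises ValueError
#
# Note that the input graph is modified in place and also returned.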


def binarize_labels(labels, sparse_output=False, return_classes=False):
    """Convert a labels vector to a binary label matrix.

    In the default single-label case, labels look like
    labels = [y1, y2, y3, ...].
    Also supports the multi-label format, in which case labels should look
    something like
    labels = [[y11, y12], [y21, y22, y23], [y31], ...].

    Parameters
    ----------
    labels : array-like, shape [num_samples]
        Array of node labels in categorical single- or multi-label format.
    sparse_output : bool, default False
        Whether to return the label_matrix in CSR format.
    return_classes : bool, default False
        Whether to return the classes corresponding to the columns of the
        label matrix.

    Returns
    -------
    label_matrix : np.ndarray or sp.csr_matrix, shape [num_samples, num_classes]
        Binary matrix of class labels.
        num_classes = number of unique values in the "labels" array.
        label_matrix[i, k] = 1 <=> node i belongs to class k.
    classes : np.array, shape [num_classes], optional
        Classes that correspond to each column of the label_matrix.
    """
    if hasattr(labels[0], "__iter__"):  # labels[0] is iterable <=> multilabel format
        binarizer = MultiLabelBinarizer(sparse_output=sparse_output)
    else:
        binarizer = LabelBinarizer(sparse_output=sparse_output)
    label_matrix = binarizer.fit_transform(labels).astype(np.float32)
    return (label_matrix, binarizer.classes_) if return_classes else label_matrix
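
# Illustrative examples (hypothetical data):
#
#     binarize_labels(np.array([0, 1, 2, 1]))
#     # -> 4x3 one-hot float32 matrix (column k <=> class k)
#     binarize_labels([[0, 1], [2]])
#     # -> multi-label rows [[1., 1., 0.], [0., 0., 1.]]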


def remove_underrepresented_classes(
    g, train_examples_per_class, val_examples_per_class
):
    """Remove nodes from the graph whose class has no more than
    train_examples_per_class + val_examples_per_class members.
    Those classes would otherwise break the training procedure,
    since they cannot fill both the per-class train and validation splits.
    """
    min_examples_per_class = train_examples_per_class + val_examples_per_class
    examples_counter = Counter(g.labels)
    keep_classes = set(
        class_
        for class_, count in examples_counter.items()
        if count > min_examples_per_class
    )
    keep_indices = [i for i in range(len(g.labels)) if g.labels[i] in keep_classes]
    return create_subgraph(g, nodes_to_keep=keep_indices)
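
# Sketch of intended usage (hypothetical graph `g`): with
# train_examples_per_class=20 and val_examples_per_class=30, every class with
# 50 or fewer nodes is removed, since it could not fill both per-class splits.
#
#     g = remove_underrepresented_classes(g, 20, 30)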