-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkernel.py
88 lines (84 loc) · 3.69 KB
/
kernel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import pandas as pd
import networkx as nx
import numpy as np
from cvxopt import matrix, solvers
### Loading the data
graphs = pd.read_pickle('training_data.pkl')
# Transforming the labels into {-1, 1} for simplicity
labels = 2 * np.array(pd.read_pickle('training_labels.pkl')) - 1
test = pd.read_pickle('test_data.pkl')
# Weisfeiler-Lehman Subtree Kernel
def wlsk(X, h=5):
# First convert all node labels to string instead of lists of one integer
n = len(X)
# Compute the inverse of the size of each graph for normalizing later
inv_graph_sizes = np.zeros(n)
for i, g in enumerate(X):
labels = nx.get_node_attributes(g, 'labels')
labels = {j: {'labels': str(lbl[0])} for j, lbl in labels.items()}
nx.set_node_attributes(g, labels)
inv_graph_sizes[i] = 1 / g.number_of_nodes()
K = np.zeros((n, n))
# We know the set of possible node labels in all graphs before the first label propagation
ind_labels = {str(i): i for i in range(50)}
for step in range(h):
print(f"============ Iteration {step} / {h} ===========")
# Remember the labels created with this propagation
new_ind_labels = set()
# Propagate the labels and compute histograms of node labels
histograms = np.zeros((n, len(ind_labels)))
for i, g in enumerate(X):
new_labels = {}
for node, attrs in g.nodes(data=True):
label = attrs['labels']
histograms[i, ind_labels[label]] += 1
new_label = []
# Propagate neighbor labels
for neighbor in g.neighbors(node):
new_label.append(g.nodes[neighbor]['labels'])
# Transform the multiset of labels into a new label
new_label = ''.join(sorted(new_label))
new_labels[node] = {'labels': new_label}
new_ind_labels.add(new_label)
# Update the labels in the graph
nx.set_node_attributes(g, new_labels)
# Add the vertex histogram kernel values corresponding to this propagation
K += (inv_graph_sizes[:, None] * histograms) @ (inv_graph_sizes[:, None] * histograms).T
# Create indexing for the new labels
ind_labels = {lbl: i for i, lbl in enumerate(new_ind_labels)}
return K / h
# Compute the Gramm matrix of both the train and test graphs
print('Computing the Gram matrix using the Weisfeiler-Lehman Subtree Kernel')
K = wlsk(graphs + test, h=5)
n = len(graphs)
K_train = K[:n, :n]
K_test = K[n:, :n]
# Define the parameters of the convex optimization problem for kernel C-SVM
# Put class weight so that samples with label 1 (10%) have a bigger impact on the learning
class_weights = np.array([0, 10, 1])
C = 1e-3
P = np.outer(labels, labels) * K_train
w = class_weights[labels]
P *= np.outer(w, w)
q = -np.ones(n)
G = np.vstack([-np.eye(n), np.eye(n)])
h = np.hstack([np.zeros(n), C * np.ones(n)])
A = labels.reshape((1, -1))
b = np.zeros(1)
# Solve the optimization problem
print('Solving the optimization problem for kernel C-SVM')
P, q, G, h, A, b = map(lambda x: matrix(x, tc='d'), [P, q, G, h, A, b])
solvers.options['show_progress'] = True
solution = solvers.qp(P, q, G, h, A, b)
alphas = np.array(solution['x']).flatten()
sv_indices = np.where(alphas > 1e-7)[0]
print('Number of support vectors:', len(sv_indices))
sv_alphas = alphas[sv_indices]
sv_labels = labels[sv_indices]
b = np.mean(sv_labels - sv_alphas @ K_train[sv_indices][:, sv_indices])
Y_pred = ((np.sign(K_test[:, sv_indices] @ sv_alphas + b) + 1) / 2).astype(int)
sub = pd.DataFrame(Y_pred).reset_index()
sub.columns = ['Id', 'Predicted']
sub['Id'] += 1
sub.to_csv('test_pred.csv', index=False)
print('Saved the predictions in the test_pred.csv file')