forked from zhangxiaoyu11/OmiVAE
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclassification.py
87 lines (72 loc) · 3.54 KB
/
classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn import svm
from sklearn import metrics
def _fit_and_score(X_train, y_train, X_test, y_test, random_seed):
    """Fit an SVM on the training split and return weighted test metrics.

    Returns a tuple (accuracy, precision, recall, f1), where precision,
    recall and f1 are weighted-averaged over classes.
    """
    clf = svm.SVC(gamma='scale', random_state=random_seed)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return (
        metrics.accuracy_score(y_test, y_pred),
        metrics.precision_score(y_test, y_pred, average='weighted'),
        metrics.recall_score(y_test, y_pred, average='weighted'),
        metrics.f1_score(y_test, y_pred, average='weighted'),
    )


def classification(latent_code, random_seed=42, ten_fold=False,
                   label_path='data/PANCAN/GDC-PANCAN_both_samples_tumour_type.tsv'):
    """Evaluate tumour-type classification performance of a latent embedding.

    Joins the latent codes with per-sample tumour-type labels (on the sample
    index), trains an SVM classifier, and prints evaluation metrics.

    Parameters
    ----------
    latent_code : pd.DataFrame
        Latent variables, one row per sample; index must match the label file.
    random_seed : int
        Seed used for splitting, cross-validation shuffling, and the SVM.
    ten_fold : bool
        If True, run stratified 10-fold cross-validation and print
        mean±std of each metric; otherwise do a single stratified
        train/val/test split (80/10/10) and print test-set metrics.
    label_path : str
        Path of the tumour-type label TSV (first column is the sample index).

    Returns
    -------
    dict
        Keys 'accuracy', 'precision', 'recall', 'f1'. For the ten-fold case
        the values are cross-validation means, and matching '<metric>_std'
        keys hold the standard deviations.
    """
    tumour_type = pd.read_csv(label_path, sep='\t', index_col=0)
    # Inner join keeps only samples present in both the embedding and labels.
    latent_code_label = pd.merge(latent_code, tumour_type,
                                 left_index=True, right_index=True)
    # Separate latent variables and targets (label is the last column).
    label = latent_code_label[['tumour_type']]
    data = latent_code_label.iloc[:, :-1]
    X = data.values
    y = label.values.ravel()

    if ten_fold:
        # Stratified 10-fold cross-validation.
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_seed)
        scores = np.zeros((10, 4))  # rows: folds; cols: acc, prec, rec, f1
        for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
            scores[fold] = _fit_and_score(X[train_index], y[train_index],
                                          X[test_index], y[test_index],
                                          random_seed)
        means = scores.mean(axis=0)
        stds = scores.std(axis=0)
        # Accuracy reported as a percentage; the rest as ratios (original format).
        print('{:.2f}±{:.2f}%'.format(means[0] * 100, stds[0] * 100))
        print('{:.3f}±{:.3f}'.format(means[1], stds[1]))
        print('{:.3f}±{:.3f}'.format(means[2], stds[2]))
        print('{:.3f}±{:.3f}'.format(means[3], stds[3]))
        return {'accuracy': means[0], 'precision': means[1],
                'recall': means[2], 'f1': means[3],
                'accuracy_std': stds[0], 'precision_std': stds[1],
                'recall_std': stds[2], 'f1_std': stds[3]}
    else:
        testset_ratio = 0.2
        valset_ratio = 0.5
        # Single stratified split: 80% train, then the held-out 20% is
        # halved into validation and test sets (val set unused here but
        # kept so the test set matches the one used elsewhere in training).
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=testset_ratio, random_state=random_seed, stratify=y)
        X_val, X_test, y_val, y_test = train_test_split(
            X_test, y_test, test_size=valset_ratio,
            random_state=random_seed, stratify=y_test)
        accuracy, precision, recall, f1 = _fit_and_score(
            X_train, y_train, X_test, y_test, random_seed)
        # All four metrics printed as percentages (original format).
        print('{:.2f}'.format(accuracy * 100))
        print('{:.2f}'.format(precision * 100))
        print('{:.2f}'.format(recall * 100))
        print('{:.2f}'.format(f1 * 100))
        return {'accuracy': accuracy, 'precision': precision,
                'recall': recall, 'f1': f1}