-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathhelpers.py
151 lines (139 loc) · 5.84 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import numpy as np
from os import listdir
from os.path import isdir, join
from pickle import load
from random import sample, shuffle
from sklearn.metrics import confusion_matrix
class NNData:
    """
    Provides an interface to load training and test datasets for `partial_fit` directly from the disk.

    Batches are stored as pickled lists of rows; the last column of every row
    is the class label, all preceding columns are features.
    """
    # Paths handed out by `training_generator`; `get_test_data` excludes them
    # so the test set stays independent of the training batches.
    training_paths = None

    def __init__(self, directory='gen/observations'):
        """
        :param directory: The directory containing the pickled observation batches.
        """
        self.directory = directory
        self.paths = [join(directory, f) for f in listdir(directory)]
        shuffle(self.paths)

    def get_input_dim(self):
        """
        Returns the input dimension for a neural network (ie the dimensionality of the input vectors).

        :return: The number of feature columns (the label column is excluded).
        """
        # NOTE(review): pickle.load is unsafe on untrusted files; the batches
        # are assumed to be generated locally.
        with open(self.paths[0], 'rb') as file:
            x = load(file)
        x = np.array(x)
        return len(x[0]) - 1  # last column is the label

    def training_generator(self, amount=10):
        """
        A generator that yields `amount` training set batches.

        :param amount: The amount of batches to yield.
        :return: Two numpy arrays per iteration: the batch training observations and the batch labels.
        """
        paths = self.paths[:amount]
        self.training_paths = paths
        for path in paths:
            with open(path, 'rb') as file:
                x = load(file)
            x = np.array(x)
            # Debug print of the batch shape removed; it polluted stdout on
            # every yielded batch.
            yield x[:, :-1], x[:, -1]

    def get_test_data(self, amount=None):
        """
        Gets a test dataset that is independent from the generated batches in `training_generator`.

        :param amount: The amount of batches to load. If `None`, takes all available batches that were not
                       used in `training_generator`.
        :return: Two numpy arrays: One containing the test observations and another containing the test labels.
        """
        # If no training batches were generated yet there is nothing to
        # exclude (previously this raised TypeError on `in None`).
        used = self.training_paths if self.training_paths is not None else []
        paths = [join(self.directory, f)
                 for f in listdir(self.directory) if join(self.directory, f) not in used]
        if amount is not None:
            paths = paths[:amount]
        x = []
        for path in paths:
            with open(path, 'rb') as file:
                x += load(file)
        x = np.array(x)
        return x[:, :-1], x[:, -1]
def load_observations(amount=1, directory='gen/observations', mode='firstk'):
    """
    Loads the observations from the specified directory.

    :param amount: The amount of files which should be loaded. Be careful with high numbers, as the observations
        quickly exceed the memory limit. Needs to be a positive integer.
    :param directory: The directory to load observations from.
    :param mode: Specifies how the `amount` batches should be drawn from all batches. Possible values:
        * firstk: Takes the first `amount` batches
        * sample: Samples `amount` batches uniformly from all batches
        * lastk: Takes the last `amount` batches
    :return: A two-dimensional numpy array containing the observations in rows and their features in columns.
    :raises ValueError: If `amount` is not positive, `directory` does not exist, or `mode` is unknown.
    """
    if amount < 1:
        raise ValueError('Amount needs to be a positive integer')
    if not isdir(directory):
        raise ValueError('Directory must exist')
    paths = [join(directory, f) for f in listdir(directory)]
    if mode == 'firstk' or mode == 'lastk':
        paths = sorted(paths)
        if mode == 'lastk':
            # BUGFIX: `reversed(paths)` returned an iterator which does not
            # support the slicing/indexing below; materialize the reversed list.
            paths = paths[::-1]
        batches = paths[:amount]
    elif mode == 'sample':
        batches = sample(paths, amount)
    else:
        # Previously an unknown mode fell through to a NameError on `x`.
        raise ValueError('Unknown mode: {}'.format(mode))
    x = []
    for batch in batches:
        # NOTE(review): pickle.load is unsafe on untrusted files.
        with open(batch, 'rb') as file:
            x += load(file)
    return np.array(x)
def split_data(x, train_pct):
    """
    Splits observations in a training and test dataset.

    :param x: The observations as numpy array that contain class labels in the last column.
    :param train_pct: The percentage of observations to use for training. 1 - `train_pct` is used for the test set.
    :return: Four numpy arrays:
        1. Training observations without class labels
        2. Test observations without class labels
        3. Training class labels
        4. Test class labels
    :raises ValueError: If the data contains no observation with label 1, or `train_pct` leaves the test
        set empty — in both cases the resampling loop below could never terminate.
    """
    training_size = int(len(x) * train_pct)
    # Guards against an infinite loop: the retry below waits for a test set
    # that contains at least one malware sample (label 1), which is impossible
    # if no such sample exists or if the test split is empty.
    if 1 not in x[:, -1]:
        raise ValueError('Observations must contain at least one sample with label 1')
    if training_size >= len(x):
        raise ValueError('train_pct leaves no observations for the test set')
    y_tr, y_te = None, None
    while y_te is None or 1 not in y_te:  # Ensure that test dataset does not contain solely benignware
        indices = np.random.permutation(len(x))
        training_idx, test_idx = indices[:training_size], indices[training_size:]
        x_tr, x_te = x[training_idx, :], x[test_idx, :]
        y_tr, y_te = x_tr[:, -1], x_te[:, -1]
        x_tr, x_te = x_tr[:, :-1], x_te[:, :-1]  # Last column is label column
    return x_tr, x_te, y_tr, y_te
def measure_performance(predictions, ground_truth):
    """
    Measures the performance of a binary classifier from its predictions.

    :param predictions: The predicted class labels (0 = benign, 1 = malware).
    :param ground_truth: The true class labels of the test dataset.
    :return: The True Positive Rate (True Positives / Real Positives) and
        False Positive Rate (False Positives / Real Negatives)
    """
    predictions = np.asarray(predictions)
    ground_truth = np.asarray(ground_truth)
    # Count the confusion-matrix cells directly. This fixes two defects of the
    # previous sklearn-based version: `fn` and `fp` were read from swapped
    # cells (sklearn's cm[0][1] is "true 0, predicted 1", i.e. a false
    # POSITIVE, and cm[1][0] is a false negative), and a single-class input
    # produced a 1x1 matrix that crashed the cm[1][1] indexing.
    tp = int(np.sum((ground_truth == 1) & (predictions == 1)))
    tn = int(np.sum((ground_truth == 0) & (predictions == 0)))
    fp = int(np.sum((ground_truth == 0) & (predictions == 1)))
    fn = int(np.sum((ground_truth == 1) & (predictions == 0)))
    # Rates default to 0 when the denominator class is absent.
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    print('True positives', tp, 'False positives', fp, 'True negatives', tn, 'False negatives', fn)
    return tpr, fpr