-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathhelpers.py
151 lines (139 loc) · 5.84 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import numpy as np
from os import listdir
from os.path import isdir, join
from pickle import load
from random import sample, shuffle
from sklearn.metrics import confusion_matrix
class NNData:
    """
    Provides an interface to load training and test datasets for `partial_fit` directly from the disk.

    Batches are stored as pickled lists of rows; the last column of every row
    is the class label, all preceding columns are features.
    """
    # Paths handed out by `training_generator`; `get_test_data` excludes them
    # so the test set stays independent of the training batches.
    training_paths = None

    def __init__(self, directory='gen/observations'):
        """
        :param directory: The directory containing the pickled observation batches.
        """
        self.directory = directory
        self.paths = [join(directory, f) for f in listdir(directory)]
        shuffle(self.paths)

    def get_input_dim(self):
        """
        Returns the input dimension for a neural network (ie the dimensionality of the input vectors).

        :return: The number of feature columns (the label column is excluded).
        """
        # NOTE(review): pickle.load is unsafe on untrusted files; the batches
        # are assumed to be generated locally.
        with open(self.paths[0], 'rb') as file:
            x = load(file)
        x = np.array(x)
        return len(x[0]) - 1  # last column is the label

    def training_generator(self, amount=10):
        """
        A generator that yields `amount` training set batches.

        :param amount: The amount of batches to yield.
        :return: Two numpy arrays per iteration: the batch training observations and the batch labels.
        """
        paths = self.paths[:amount]
        self.training_paths = paths
        for path in paths:
            with open(path, 'rb') as file:
                x = load(file)
            x = np.array(x)
            # Debug print of the batch shape removed; it polluted stdout on
            # every yielded batch.
            yield x[:, :-1], x[:, -1]

    def get_test_data(self, amount=None):
        """
        Gets a test dataset that is independent from the generated batches in `training_generator`.

        :param amount: The amount of batches to load. If `None`, takes all available batches that were not
                       used in `training_generator`.
        :return: Two numpy arrays: One containing the test observations and another containing the test labels.
        """
        # If no training batches were generated yet there is nothing to
        # exclude (previously this raised TypeError on `in None`).
        used = self.training_paths if self.training_paths is not None else []
        paths = [join(self.directory, f)
                 for f in listdir(self.directory) if join(self.directory, f) not in used]
        if amount is not None:
            paths = paths[:amount]
        x = []
        for path in paths:
            with open(path, 'rb') as file:
                x += load(file)
        x = np.array(x)
        return x[:, :-1], x[:, -1]
def load_observations(amount=1, directory='gen/observations', mode='firstk'):
    """
    Loads the observations from the specified directory.

    :param amount: The amount of files which should be loaded. Be careful with high numbers, as the observations
        quickly exceed the memory limit. Needs to be a positive integer.
    :param directory: The directory to load observations from.
    :param mode: Specifies how the `amount` batches should be drawn from all batches. Possible values:
        * firstk: Takes the first `amount` batches
        * sample: Samples `amount` batches uniformly from all batches
        * lastk: Takes the last `amount` batches
    :return: A two-dimensional numpy array containing the observations in rows and their features in columns.
    :raises ValueError: If `amount` is not positive, `directory` does not exist, or `mode` is unknown.
    """
    if amount < 1:
        raise ValueError('Amount needs to be a positive integer')
    if not isdir(directory):
        raise ValueError('Directory must exist')
    paths = [join(directory, f) for f in listdir(directory)]
    if mode == 'firstk' or mode == 'lastk':
        paths = sorted(paths)
        if mode == 'lastk':
            # BUGFIX: `reversed(paths)` returned an iterator which does not
            # support the slicing/indexing below; materialize the reversed list.
            paths = paths[::-1]
        batches = paths[:amount]
    elif mode == 'sample':
        batches = sample(paths, amount)
    else:
        # Previously an unknown mode fell through to a NameError on `x`.
        raise ValueError('Unknown mode: {}'.format(mode))
    x = []
    for batch in batches:
        # NOTE(review): pickle.load is unsafe on untrusted files.
        with open(batch, 'rb') as file:
            x += load(file)
    return np.array(x)
def split_data(x, train_pct):
    """
    Splits observations in a training and test dataset.

    :param x: The observations as numpy array that contain class labels in the last column.
    :param train_pct: The percentage of observations to use for training. 1 - `train_pct` is used for the test set.
    :return: Four numpy arrays:
        1. Training observations without class labels
        2. Test observations without class labels
        3. Training class labels
        4. Test class labels
    :raises ValueError: If the data contains no observation with label 1, or `train_pct` leaves the test
        set empty — in both cases the resampling loop below could never terminate.
    """
    training_size = int(len(x) * train_pct)
    # Guards against an infinite loop: the retry below waits for a test set
    # that contains at least one malware sample (label 1), which is impossible
    # if no such sample exists or if the test split is empty.
    if 1 not in x[:, -1]:
        raise ValueError('Observations must contain at least one sample with label 1')
    if training_size >= len(x):
        raise ValueError('train_pct leaves no observations for the test set')
    y_tr, y_te = None, None
    while y_te is None or 1 not in y_te:  # Ensure that test dataset does not contain solely benignware
        indices = np.random.permutation(len(x))
        training_idx, test_idx = indices[:training_size], indices[training_size:]
        x_tr, x_te = x[training_idx, :], x[test_idx, :]
        y_tr, y_te = x_tr[:, -1], x_te[:, -1]
        x_tr, x_te = x_tr[:, :-1], x_te[:, :-1]  # Last column is label column
    return x_tr, x_te, y_tr, y_te
def measure_performance(predictions, ground_truth):
    """
    Measures the performance of a binary classifier from its predictions.

    :param predictions: The predicted class labels (0 = benign, 1 = malware).
    :param ground_truth: The true class labels of the test dataset.
    :return: The True Positive Rate (True Positives / Real Positives) and
        False Positive Rate (False Positives / Real Negatives)
    """
    predictions = np.asarray(predictions)
    ground_truth = np.asarray(ground_truth)
    # Count the confusion-matrix cells directly. This fixes two defects of the
    # previous sklearn-based version: `fn` and `fp` were read from swapped
    # cells (sklearn's cm[0][1] is "true 0, predicted 1", i.e. a false
    # POSITIVE, and cm[1][0] is a false negative), and a single-class input
    # produced a 1x1 matrix that crashed the cm[1][1] indexing.
    tp = int(np.sum((ground_truth == 1) & (predictions == 1)))
    tn = int(np.sum((ground_truth == 0) & (predictions == 0)))
    fp = int(np.sum((ground_truth == 0) & (predictions == 1)))
    fn = int(np.sum((ground_truth == 1) & (predictions == 0)))
    # Rates default to 0 when the denominator class is absent.
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    print('True positives', tp, 'False positives', fp, 'True negatives', tn, 'False negatives', fn)
    return tpr, fpr