forked from 33onethird/malware-test
-
Notifications
You must be signed in to change notification settings - Fork 0
/
gen_vectors.py
executable file
·97 lines (91 loc) · 5.34 KB
/
gen_vectors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python3
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from csv import reader
from math import ceil
from pickle import dump, load
from os import listdir, mkdir
from os.path import basename, isdir, join, splitext
# Default locations and batch size. They double as the defaults of
# `generate_vectors` and of the command-line options.
features_path = 'gen/features.p'
labels_path = 'labels.csv'
in_directory = 'feature_vectors'
observations_per_file = 1000
out_directory = 'gen/observations'


def _last_batch_number(directory):
    """Return the highest numeric file stem found in *directory*, or 0 if there is none."""
    numbers = []
    for name in listdir(directory):
        stem = name.split('.')[0]
        # Ignore anything that is not a numbered batch file (e.g. '.DS_Store').
        if stem.isdecimal():
            numbers.append(int(stem))
    # Compare numerically: a lexicographic sort would rank '9.p' above '10.p'.
    return max(numbers, default=0)


def _dump_batch(batch, number, width, directory):
    """Pickle *batch* to '<number zero-padded to width>.p' inside *directory*."""
    filename = str(number).zfill(width) + '.p'
    with open(join(directory, filename), 'wb') as file:
        dump(batch, file)


def generate_vectors(features_path=features_path, labels_path=labels_path, in_directory=in_directory,
                     observations_per_file=observations_per_file, out_directory=out_directory):
    """Turn raw text observations into binary feature vectors and pickle them in batches.

    Every file in `in_directory` is read as one observation. For each entry of the
    pickled feature collection the vector holds 1 if that entry equals one of the
    file's lines and 0 otherwise. When `labels_path` is given, a final element is
    appended: 1 if the file name without extension occurs in the first column of
    the CSV (header row skipped), else 0.

    Vectors are written to `out_directory` as pickled lists of at most
    `observations_per_file` vectors, named '<batch number>.p'. Numbering continues
    after the highest numbered file already present, so existing batches are not
    overwritten.

    :param features_path: Path to the pickled collection of feature strings.
    :param labels_path: Path to the labels CSV, or None to skip labelling.
    :param in_directory: Directory containing one text file per observation.
    :param observations_per_file: Maximum number of vectors per output file (>= 1).
    :param out_directory: Directory for the output files; created if missing.
    :raises ValueError: If `observations_per_file` is smaller than 1.
    """
    # Local import: the module's import block only brings in `mkdir`.
    from os import makedirs

    if observations_per_file < 1:
        raise ValueError('observations_per_file must be a positive integer')

    # Unlike mkdir, this also creates missing parents of a nested output path.
    makedirs(out_directory, exist_ok=True)

    # NOTE(security): unpickling can execute arbitrary code. Only load feature
    # files from a trusted source.
    with open(features_path, 'rb') as file:
        features = load(file)

    positive_observations = None
    if labels_path is not None:
        with open(labels_path, 'r', newline='') as file:
            label_reader = reader(file)
            next(label_reader, None)  # Skip the header row; tolerate an empty file
            # A set gives O(1) lookups; blank CSV rows are skipped.
            positive_observations = {row[0] for row in label_reader if row}

    # Sorted so that batch contents are reproducible across runs and platforms.
    paths = [join(in_directory, name) for name in sorted(listdir(in_directory))]
    total = len(paths)

    last_nr = _last_batch_number(out_directory)
    batch_count = (total + observations_per_file - 1) // observations_per_file
    # Pad to the width of the highest number written by this run, so the
    # remainder file is padded like the full batches.
    fill_nr = len(str(last_nr + batch_count))

    observations = []
    batch_nr = last_nr
    for i, path in enumerate(paths):
        print('Observation ' + str(i) + ' of ' + str(total), path)
        with open(path, 'r') as file:
            # Assumes features are hashable (strings), as the lines are.
            lines = set(file.read().splitlines())
        observation = [1 if feature in lines else 0 for feature in features]
        if positive_observations is not None:
            observation_id = splitext(basename(path))[0]
            observation.append(1 if observation_id in positive_observations else 0)
        observations.append(observation)
        if len(observations) == observations_per_file:
            batch_nr += 1
            _dump_batch(observations, batch_nr, fill_nr, out_directory)
            observations = []
    if observations:  # Dump the remaining observations of the last, partial batch
        batch_nr += 1
        _dump_batch(observations, batch_nr, fill_nr, out_directory)
# Command-line entry point: parse the options and run the vector generation.
if __name__ == "__main__":
    parser = ArgumentParser(description='Generates python vectors from string observations',
                            formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n', '--number', type=int, default=observations_per_file,
                        help='Number of observations per outputted file. The higher the number, the higher '
                             'the memory requirements.')
    parser.add_argument('-i', '--input', type=str, default=in_directory,
                        help='The directory in which the input observations in their raw string format '
                             'are located. Must be the same directory that was also used as input '
                             'directory for `acc_features.py`.')
    parser.add_argument('-l', '--labels', type=str, default=labels_path,
                        help='The path to a CSV file that assigns the positive class label to certain '
                             'observations contained in the input directory. It contains two columns: '
                             'An observation id column and a malware family column. Pass `None` to skip '
                             'labelling')
    parser.add_argument('-f', '--features', type=str, default=features_path,
                        help='The path to the file which contains all possible features in a binary '
                             'format. This should be the output file of `acc_features.py`.')
    parser.add_argument('-o', '--output', type=str, default=out_directory,
                        help='The directory which the output files should be written to.')
    args = parser.parse_args()

    # argparse delivers the literal string 'None'. Convert it to a real None so
    # that labelling is skipped instead of trying to open a file called 'None'.
    labels = None if args.labels == 'None' else args.labels

    generate_vectors(args.features, labels, args.input, args.number, args.output)