-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathikarus_prep.py
executable file
·59 lines (56 loc) · 2.85 KB
/
ikarus_prep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/env python3
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from csv import writer
from os import chdir, getcwd, listdir, makedirs, mkdir
from os.path import abspath, isdir, join
from subprocess import call

from acc_features import accumulate_features
from gen_vectors import generate_vectors
# Default locations of the two Java tools. `featurecopy_path` is resolved
# relative to the extractor directory, not the directory the script was
# launched from.
extractor_path = '../FeatureExtractor'
featurecopy_path = '../filter'


def _str_to_bool(value):
    """Convert a command-line token such as ``True``, ``false`` or ``1`` to a bool.

    ``type=bool`` cannot be used with argparse: ``bool('False')`` is ``True``
    because every non-empty string is truthy.

    Raises:
        ValueError: if the token is not a recognised boolean spelling; argparse
            turns this into a regular usage error.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1', 'on'):
        return True
    # The empty string stays False, as it was under ``bool('')``.
    if token in ('false', 'f', 'no', 'n', '0', 'off', ''):
        return False
    raise ValueError('expected a boolean, got {!r}'.format(value))


# Command-line interface. ArgumentDefaultsHelpFormatter appends each option's
# default value to its help text.
parser = ArgumentParser(description='Prepares ikarus dataset', formatter_class=ArgumentDefaultsHelpFormatter)
parser.add_argument('data', type=str, help='The path to the directory containing clean (benignware) and malware apks.')
parser.add_argument('-s', '--string', type=str, default=None, help='Where to output the string features. Leave empty to '
                    'use `gen/string_features` inside the `data` directory.')
parser.add_argument('-l', '--label', type=str, default=None, help='Path to the label csv output. Leave empty to use '
                    '`gen/labels.csv` inside the `data` directory.')
parser.add_argument('-e', '--extractor', type=str, default=extractor_path, help='The path to the directory containing '
                    'the feature extractor jar.')
# NOTE(review): nothing in the visible part of this script reads
# `args.use_adware` - confirm whether the adware handling still has to be wired in.
parser.add_argument('--use-adware', type=_str_to_bool, default=False, help='Set True to use apks in the `adware` directory as '
                    'malware. Otherwise they remain unused.')
# Parse the command line and derive every path the rest of the script uses.
args = parser.parse_args()
data_dir = args.data

# Input layout: the apks are read from <data>/malware and <data>/clean.
malware_dir = join(data_dir, 'malware')
clean_dir = join(data_dir, 'clean')

# Generated artefacts go under <data>/gen.
output_dir = join(data_dir, 'gen')
features_path = join(output_dir, 'features.p')
vectors_dir = join(output_dir, 'observations')

# String features: explicit -s/--string wins, otherwise a folder inside gen.
if args.string is None:
    string_dir = join(output_dir, 'string_features')
else:
    string_dir = args.string
reduced_string_dir = string_dir + '_reduced'

# Label csv: explicit -l/--label wins, otherwise a file inside gen.
if args.label is None:
    label_path = join(output_dir, 'labels.csv')
else:
    label_path = args.label

# From here on `extractor_path` holds the user's choice (or the default).
extractor_path = args.extractor
# Make sure all output directories exist before anything writes into them.
# makedirs(..., exist_ok=True) also creates missing parents (a custom --string
# path may point anywhere) and avoids the check-then-create race of
# `if not isdir(...): mkdir(...)`.
for _needed_dir in (output_dir, string_dir, reduced_string_dir):
    makedirs(_needed_dir, exist_ok=True)
# Write the label csv: a `name` header followed by one row per entry found in
# the malware directory.
# Sorted so repeated runs produce the same file; listdir() order is arbitrary.
malware_paths = sorted(listdir(malware_dir))
# Mode 'w' rather than 'a': appending added a second header and duplicate rows
# on every re-run. newline='' lets the csv module control line endings itself.
with open(label_path, 'w', newline='') as label_file:
    label_writer = writer(label_file)
    label_writer.writerow(['name'])
    label_writer.writerows([name] for name in malware_paths)
# Run the two Java tools, then hand their output to the project helpers.
#
# Each jar is started with `cwd=` set to its own directory instead of
# chdir()-ing the whole process. Every path passed to a jar is made absolute
# first, because a relative `data` / --string argument would otherwise be
# interpreted relative to the jar's directory.
extractor_dir = abspath(extractor_path)
# A relative featurecopy_path is resolved against the extractor directory,
# which is where the process stood when the script used chdir() for this step.
featurecopy_dir = join(extractor_dir, featurecopy_path)
string_dir_abs = abspath(string_dir)
reduced_string_dir_abs = abspath(reduced_string_dir)

for apk_dir in (malware_dir, clean_dir):
    result = call(
        ['java', '-jar', 'MultiExtractor.jar', abspath(apk_dir), string_dir_abs,
         'cut', 'activity', 'service_receiver'],
        cwd=extractor_dir,
    )
    if result != 0:
        # Propagate the tool's exit status. `raise SystemExit` does not depend
        # on the `site`-provided exit() helper.
        raise SystemExit(result)

result = call(['java', '-jar', 'FeatureCopy.jar', string_dir_abs, reduced_string_dir_abs], cwd=featurecopy_dir)
if result != 0:
    raise SystemExit(result)

# The working directory was never changed, so the original (possibly relative)
# paths are still valid for the in-process steps.
accumulate_features(reduced_string_dir, output_dir)
generate_vectors(features_path, label_path, reduced_string_dir, out_directory=vectors_dir)