forked from 33onethird/malware-test
-
Notifications
You must be signed in to change notification settings - Fork 0
/
predict.py
executable file
·100 lines (89 loc) · 4.27 KB
/
predict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python3
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from os import chdir, listdir, mkdir, getcwd
from os.path import isdir, isfile, join, expanduser
from pickle import load as pickle_load
from shutil import rmtree
from subprocess import call
import numpy as np
from sklearn.externals.joblib import load
from .gen_vectors import generate_vectors
# Defaults shared by `predict()` and the command-line interface below.

# Algorithm whose model sub-directory is searched when `models` is a directory.
algorithm = 'svm'
# Trained model file, or a directory holding one sub-directory per algorithm.
models_path = 'models'
# Directory from which `MultiExtractor.jar` is run.
extractor_path = '../FeatureExtractor'
# Directory from which `FeatureCopy.jar` is run.
featurecopy_path = '../filter'
# Scratch directory for intermediate files; `predict()` deletes it when done.
gen_dir = expanduser('~/data/tmp')
# File handed to `generate_vectors` as its `features_path` argument.
features_path = 'gen/features.p'
def predict(data, alg=algorithm, models=models_path, features=features_path, extractor=extractor_path, featurecopy=featurecopy_path):
    """Predict a label for every app found under ``data``.

    The pipeline is: load a trained model, run ``MultiExtractor.jar`` and
    then ``FeatureCopy.jar`` to produce text feature files in a scratch
    directory, turn those into vectors with ``generate_vectors`` and pass
    the vectors to ``model.predict``.

    Relative ``data``, ``extractor`` and ``featurecopy`` paths are resolved
    against the caller's working directory, the same way ``models`` and
    ``features`` are. The process working directory is never changed.

    Args:
        data: Path given to ``MultiExtractor.jar`` as its input.
        alg: Name of the sub-directory of ``models`` to search when
            ``models`` is a directory.
        models: Either a model file, or a directory; for a directory the
            lexicographically last entry of ``models/alg`` is loaded.
        features: Forwarded to ``generate_vectors`` as ``features_path``.
        extractor: Directory containing ``MultiExtractor.jar``.
        featurecopy: Directory containing ``FeatureCopy.jar``.

    Returns:
        A dict mapping each file name of the intermediate text directory,
        minus its last four characters, to the predicted label.

    Raises:
        SystemExit: With the tool's exit status if either Java tool
            returns a non-zero status.
        IndexError: If ``models`` is a directory and ``models/alg`` is empty.
    """
    from os import makedirs  # function-scope import: not imported at file level

    if isfile(models):
        model_file = models
    else:
        candidates = sorted(listdir(join(models, alg)))
        model_file = join(models, alg, candidates[-1])
    with open(model_file, 'rb') as file:
        model = load(file)

    # Anchor every path to the caller's working directory up front so the
    # Java tools (which run from their own directories) see the same
    # locations Python does. `join` leaves absolute paths untouched.
    wd = getcwd()
    data = join(wd, data)
    extractor = join(wd, extractor)
    featurecopy = join(wd, featurecopy)
    scratch_root = join(wd, gen_dir)

    raw_string_dir = join(scratch_root, 'raw_txt')
    string_dir = join(scratch_root, 'txt')
    observations_dir = join(scratch_root, 'vec')

    try:
        for scratch in (raw_string_dir, string_dir, observations_dir):
            # Leftovers from an aborted run would contaminate the results.
            if isdir(scratch):
                rmtree(scratch)
            makedirs(scratch)

        # `cwd=` runs each tool from its own directory without touching
        # this process's working directory.
        status = call(['java', '-jar', 'MultiExtractor.jar', data, raw_string_dir, 'cut', 'activity',
                       'service_receiver'], cwd=extractor)
        if status != 0:
            raise SystemExit(status)
        status = call(['java', '-jar', 'FeatureCopy.jar', raw_string_dir, string_dir], cwd=featurecopy)
        if status != 0:
            raise SystemExit(status)

        generate_vectors(features_path=features, in_directory=string_dir, out_directory=observations_dir,
                         labels_path=None)

        observations = []
        # Read in sorted order: the names below are sorted too, and labels
        # are paired with names purely by position.
        # NOTE(review): assumes the vector files sort in the same order as
        # the text files - confirm against `generate_vectors`.
        for path in sorted(listdir(observations_dir)):
            with open(join(observations_dir, path), 'rb') as file:
                # NOTE(review): unpickling is only safe because these files
                # were just written by `generate_vectors`.
                observations += pickle_load(file)
        observations = np.array(observations)
        names = sorted(listdir(string_dir))
    finally:
        # Always drop the scratch data, including on failure, so the next
        # run starts clean.
        rmtree(scratch_root, ignore_errors=True)

    labels = model.predict(observations)
    # The slice drops the last four characters of each file name
    # (presumably a '.txt' extension - verify against FeatureCopy's output).
    return {names[i][:-4]: label for i, label in enumerate(labels)}
if __name__ == '__main__':
    # Command-line entry point: parse the options, run `predict` and print
    # one "<name> <label>" line per app.
    parser = ArgumentParser(description='Predicts unlabelled apps', formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('-a', '--algorithm', type=str, default=algorithm, help='The algorithm to use for prediction',
                        choices=['svm', 'lr', 'rf'])
    # Adjacent string literals are concatenated verbatim, so each fragment
    # carries its own trailing space.
    parser.add_argument('-m', '--models', type=str, default=models_path,
                        help='The path to the trained algorithm model(s). '
                             'You can specify a file or a directory. If '
                             'you specify a directory, the latest '
                             'algorithm model file will be chosen. This '
                             'should be the output of `./experiments.py`.')
    parser.add_argument('data', type=str, help='The path to the original apk files to predict. The directory needs'
                                               ' to include only valid .apk files')
    parser.add_argument('-e', '--extractor', type=str, default=extractor_path,
                        help='The path to the directory containing '
                             'the feature extractor jar.')
    parser.add_argument('-c', '--copy', type=str, default=featurecopy_path,
                        help='The path to the directory containing '
                             'the feature copy jar.')
    parser.add_argument('-f', '--features', type=str, default=features_path,
                        help='The path to the file containing all '
                             'possible features. Should be the output '
                             'of `acc_features.py`.')
    args = parser.parse_args()

    results = predict(args.data, args.algorithm, args.models, args.features, args.extractor, args.copy)

    print('============')
    print('Result')
    print('============')
    # `predict` returns a dict; iterating it directly would yield only keys.
    for name, label in results.items():
        print(name, label)