antivirus.py
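"""Train a malware classifier from static features.

Reads a ';'-separated train.csv whose 'type' column holds the class label,
selects the most informative feature columns with an ExtraTrees-based
SelectFromModel step, trains a fully connected Keras network on those
columns, saves the model (malware.h5 / malware.hdf5) and the selected
feature names (classifier/features.pkl), and prints the confusion matrix
and false positive / false negative rates on a 20% hold-out set.
"""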
import numpy as np
import pandas as pd
import pickle

import sklearn.ensemble as ske
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from keras.models import Sequential
from keras.layers import Dense, Dropout
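# Load the labelled training set. train.csv is expected to be ';'-separated
# with a 'type' column holding the class label; every other column is treated
# as a numeric feature.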
data = pd.read_csv('train.csv', sep=';')
X1 = data.drop(['type'], axis=1).values
y = data['type'].values
print('Researching important features based on %i total features\n' % X1.shape[1])
# Feature selection using an ExtraTrees classifier.
fsel = ske.ExtraTreesClassifier().fit(X1, y)
model1 = SelectFromModel(fsel, prefit=True)
X_new = model1.transform(X1)
nb_features = X_new.shape[1]

print('%i features identified as important:' % nb_features)

# Map the importance ranking back to column names (X1 is data without 'type').
feature_cols = data.drop(['type'], axis=1).columns
indices = np.argsort(fsel.feature_importances_)[::-1][:nb_features]
xnew = [feature_cols[i] for i in indices]
for f in range(nb_features):
    print(" %d. feature %s (%f)" % (f + 1, xnew[f], fsel.feature_importances_[indices[f]]))
# Keep only the selected columns, in the same order as xnew, so the training
# matrix lines up with the feature list saved below.
xfinal = data[xnew].values
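# Hold out 20% of the samples for evaluation.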
X_train, X_test, y_train, y_test = train_test_split(xfinal, y, test_size = 0.2, random_state = 0)
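# The classifier: a stack of 105-unit ReLU layers with 20% dropout between
# groups of layers, ending in a single sigmoid unit for the binary 'type'
# prediction.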
model = Sequential()
model.add(Dense(units=105, kernel_initializer='uniform', activation='relu', input_dim=nb_features))
model.add(Dense(units=105, kernel_initializer='uniform', activation='relu'))
model.add(Dense(units=105, kernel_initializer='uniform', activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=105, kernel_initializer='uniform', activation='relu'))
model.add(Dense(units=105, kernel_initializer='uniform', activation='relu'))
model.add(Dense(units=105, kernel_initializer='uniform', activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=105, kernel_initializer='uniform', activation='relu'))
model.add(Dense(units=105, kernel_initializer='uniform', activation='relu'))
model.add(Dense(units=105, kernel_initializer='uniform', activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=105, kernel_initializer='uniform', activation='relu'))
model.add(Dense(units=105, kernel_initializer='uniform', activation='relu'))
model.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=60, epochs=250)
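# Persist the trained network and the ordered list of selected feature names
# so that new samples can later be scored with the same columns.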
model.save('malware.h5')
model.save_weights('malware.hdf5')
with open('classifier/features.pkl', 'wb') as fh:
    pickle.dump(xnew, fh)
# Score the hold-out set; predictions above 0.6 are counted as the positive
# class (this treats 'type' == 1 as malicious and 'type' == 0 as legitimate).
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.6)
mt = confusion_matrix(y_test, y_pred)
print('False positive rate : %f %%' % ((mt[0][1] / float(sum(mt[0]))) * 100))
print('False negative rate : %f %%' % ((mt[1][0] / float(sum(mt[1]))) * 100))
print(mt)
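
# A minimal sketch of how the saved artifacts could later be reloaded to score
# new samples (kept as a comment so it does not run as part of training;
# 'sample.csv' is a hypothetical ';'-separated file containing the selected
# feature columns):
#
#   from keras.models import load_model
#   clf = load_model('malware.h5')
#   with open('classifier/features.pkl', 'rb') as fh:
#       feats = pickle.load(fh)
#   sample = pd.read_csv('sample.csv', sep=';')
#   preds = clf.predict(sample[feats].values) > 0.6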