-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessData.py
114 lines (91 loc) · 3.84 KB
/
preprocessData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from datetime import datetime
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
from sklearn import preprocessing
def feature_scaling(data, numeric_attrs):
    """Z-score normalize each numeric column of *data*.

    Columns with zero standard deviation are constant, carry no
    information, and cannot be scaled (division by zero), so they are
    dropped instead.
    """
    for col in numeric_attrs:
        deviation = data[col].std()
        if deviation == 0:
            # Constant column: drop it rather than divide by zero.
            data = data.drop(col, axis=1)
        else:
            data[col] = (data[col] - data[col].mean()) / deviation
    return data
def encode_cate_attrs(data, cate_attrs):
    """Encode categorical columns for modelling.

    'education' gets an ordinal encoding (it has a natural order, see
    encode_edu_attrs); every other attribute in *cate_attrs* is one-hot
    encoded into columns named '<attr>_<value>' and the original column
    is dropped.

    Bug fix: the previous version called ``cate_attrs.remove('education')``,
    mutating the caller's list as a side effect (and raising ValueError
    when 'education' was absent).  We now skip it locally instead.
    """
    data = encode_edu_attrs(data)
    for attr in cate_attrs:
        if attr == 'education':
            continue  # already ordinal-encoded above
        dummies_df = pd.get_dummies(data[attr])
        dummies_df = dummies_df.rename(columns=lambda x: attr + '_' + str(x))
        data = pd.concat([data, dummies_df], axis=1)
        data = data.drop(attr, axis=1)
    return data
def encode_bin_attrs(data, bin_attrs):
    """Map 'no'/'yes' answers in each binary column to 0/1.

    Any other value (e.g. 'unknown') is left untouched so it can be
    imputed later.
    """
    mapping = {'no': 0, 'yes': 1}
    for attr in bin_attrs:
        for label, code in mapping.items():
            data.loc[data[attr] == label, attr] = code
    return data
def encode_edu_attrs(data):
    """Replace education level names with ordinal codes 1..7 (low to high).

    Values not in the list (e.g. 'unknown') are left untouched so they
    can be imputed later.
    """
    ordered_levels = ["illiterate", "basic.4y", "basic.6y", "basic.9y",
                      "high.school", "professional.course",
                      "university.degree"]
    for rank, name in enumerate(ordered_levels, start=1):
        data.loc[data['education'] == name, 'education'] = rank
    return data
def trans_num_attrs(data, numeric_attrs):
    """Discretize 'age' into 10 quantile bins, then z-scale numeric columns.

    The age column is replaced by 1-based bin codes (pd.qcut then
    factorize); afterwards every column in *numeric_attrs* is
    standardized with sklearn's StandardScaler.  NOTE(review): when
    'age' is also listed in numeric_attrs the bin codes themselves get
    scaled — presumably intentional; confirm with the caller.
    """
    n_bins = 10
    bin_attr = 'age'
    binned = pd.qcut(data[bin_attr], n_bins)
    data[bin_attr] = pd.factorize(binned)[0] + 1
    for attr in numeric_attrs:
        column = data[attr].values.reshape(-1, 1)
        data[attr] = preprocessing.StandardScaler().fit_transform(column)
    return data
def fill_unknown(data, bin_attrs, cate_attrs, numeric_attrs):
    """Resolve 'unknown' values, then encode the whole frame.

    For each binary/categorical attribute: rows with fewer than 500
    'unknown' entries are simply dropped; attributes with 500+ unknowns
    are remembered and later imputed with a random forest trained on the
    rows where the value is known.  Encoding (one-hot, binary, numeric
    scaling) happens in between, and the target 'y' is mapped to 0/1.
    """
    # Attributes whose unknowns will be predicted — e.g.
    # ['education', 'default', 'housing', 'loan'] on the full dataset.
    fill_attrs = []
    for i in bin_attrs + cate_attrs:
        if data[data[i] == 'unknown']['y'].count() < 500:
            # Few unknowns: cheaper to drop the affected rows.
            data = data[data[i] != 'unknown']
        else:
            fill_attrs.append(i)
    data = encode_cate_attrs(data, cate_attrs)
    data = encode_bin_attrs(data, bin_attrs)
    data = trans_num_attrs(data, numeric_attrs)
    data['y'] = data['y'].map({'no': 0, 'yes': 1}).astype(int)
    for i in fill_attrs:
        # Bug fix: take an explicit copy so the imputed values are
        # written into test_data itself.  The old chained assignment on
        # a slice of `data` raised SettingWithCopyWarning and is a
        # silent no-op under pandas copy-on-write.
        test_data = data[data[i] == 'unknown'].copy()
        testX = test_data.drop(fill_attrs, axis=1)
        train_data = data[data[i] != 'unknown']
        trainY = train_data[i]
        trainX = train_data.drop(fill_attrs, axis=1)
        test_data[i] = train_predict_unknown(trainX, trainY.astype('int'), testX)
        data = pd.concat([train_data, test_data])
    return data
def train_predict_unknown(trainX, trainY, testX):
    """Fit a 100-tree random forest on (trainX, trainY) and predict testX.

    Returns the integer predictions as a single-column DataFrame indexed
    like testX, so the result can be assigned back into the original frame.
    """
    model = RandomForestClassifier(n_estimators=100).fit(trainX, trainY)
    predictions = model.predict(testX).astype(int)
    return pd.DataFrame(predictions, index=testX.index)
def preprocess_data(
        input_data_path="D:/code/sparkProject/sparkInput/bank-additional/bank-additional-full.csv",
        processed_data_path="D:/code/sparkProject/sparkInput/bank-additional/bank-additional-full-processed.csv"):
    """Load the bank-additional CSV, clean/encode it, and write the result.

    Generalized: the input and output paths are now keyword parameters
    (defaults preserve the original hard-coded locations), so the
    function is usable outside the author's machine.

    Pipeline: read ';'-separated CSV -> shuffle rows -> fill_unknown
    (drop/impute unknowns and encode all attributes) -> write CSV
    without the index.
    """
    print("Loading data...")
    data = pd.read_csv(input_data_path, sep=';')
    print("Preprocessing data...")
    numeric_attrs = ['age', 'duration', 'campaign', 'pdays', 'previous',
                     'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
                     'euribor3m', 'nr.employed', ]
    bin_attrs = ['default', 'housing', 'loan']
    cate_attrs = ['poutcome', 'education', 'job', 'marital',
                  'contact', 'month', 'day_of_week']
    data = shuffle(data)
    data = fill_unknown(data, bin_attrs, cate_attrs, numeric_attrs)
    data.to_csv(processed_data_path, index=False)
if __name__ == "__main__":
    # Guard the entry point so importing this module does not trigger
    # the (expensive) preprocessing run.
    start_time = datetime.now()
    preprocess_data()
    end_time = datetime.now()
    # Bug fix: timedelta.seconds silently drops whole days;
    # total_seconds() reports the full elapsed time.
    delta_seconds = int((end_time - start_time).total_seconds())
    print("Cost time: {}s".format(delta_seconds))