-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgeneral_kfold.py
168 lines (149 loc) · 7.43 KB
/
general_kfold.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
"""
Author : Zach Seiss
Email : [email protected]
Written : May 25, 2022
Last Update : June 5, 2022
"""
import numpy as np
import pandas as pd
import random
from datetime import datetime
from time import time
from bayes_net_model import make_bn
from optimized_query import fast_query
from get_client_spreadsheet import return_client_csv
# we will define variables begin and end to keep track of program execution time
begin = time()
# Fixed seed so the shuffle below (and therefore the fold assignment) is reproducible.
random.seed(0)
# Rows with more than DROP_CUTOFF missing values are excluded from the experiment.
DROP_CUTOFF = 20
df = pd.read_csv('/media/zach/MULTIBOOT/ACS/ACST_Cust_Data.csv')
# NOTE(review): reset_index() without drop=True keeps the old index as a new
# 'index' column in df — confirm that is intended.
df = df.loc[df['MissingValues'] <= DROP_CUTOFF].reset_index()
# K is the number of cross-validation folds.
K = 10
NUM_ROWS = len(df)
# NOTE(review): TARGET_VARIABLE is 'Satisfied' here, but the network edges
# defined later use a node named 'Target' — verify these refer to the same column.
TARGET_VARIABLE = 'Satisfied'
index = list(range(len(df)))
# random.sample of the full index is a reproducible shuffle of all rows.
random_sample = df.iloc[np.array(random.sample(index, NUM_ROWS))]
# sample_index holds the shuffled row labels; fold membership is derived from it.
sample_index = random_sample.index
'''
We let n = length of index mod k. That is, if we split the random sample into k equal sized groups,
there will be n < k remaining unassigned elements in the sample. So we remove the last n elements from
index, split index into k equal sized groups then assign the remaining n elements to the first n of the
k groups.
'''
n = NUM_ROWS % K
# The last n shuffled labels are held out and redistributed one-per-fold below.
remaining_indices = sample_index[NUM_ROWS - n:]
mod_sample_index = sample_index[: NUM_ROWS - n]
# np.split requires an exact division, which mod_sample_index guarantees.
test_group_indexes = np.split(np.array(mod_sample_index), K)
for i in range(n):
    # Give each of the first n folds one of the leftover labels.
    test_group_indexes[i] = np.append(test_group_indexes[i], (remaining_indices[i]))
'''
We get a list of the training group indexes by removing the
indexes associated with the ith testing group from the list of indices of
the full data set
'''
# train_group_indexes is an array of length k which contains the index for each training group
train_group_indexes = [sample_index.drop(test_group_indexes[i]) for i in range(K)]
# training_groups[i] is the DataFrame of all rows NOT in fold i's test set.
training_groups = [df.iloc[train_group_indexes[i]] for i in range(K)]
'''
for each training group we have to train a new BN. Then we will query that BN for each member
of the associated testing group and compare its max likelihood prediction against the true value.
'''
''' AS CURRENTLY IMPLEMENTED, THIS PROGRAM WILL FAIL FOR LESS THAN 3 NODE BNs!!!!
'''
bayesian_networks = []
for i in range(K):
    # make_bn (project helper) builds and fits a Bayesian network from the
    # training rows and this fixed edge list (parent, child) defining the DAG.
    # NOTE(review): the edges name a node 'Target' while TARGET_VARIABLE above
    # is 'Satisfied' — confirm these are the same variable.
    bn = make_bn(training_groups[i], [('Var1', 'Target'),
                                      ('Var2', 'Target'),
                                      ('MissingValues', 'Target'),
                                      ('Product', 'Target'),
                                      ('Target', 'Var3_grouped'),
                                      ('Target', 'Var4_grouped'),
                                      ('Target', 'BinVar1'),
                                      ('Target', 'BinVar2'),
                                      ('Target', 'BinVar3')])
    bayesian_networks.append(bn)
'''
Now we have to create a VariableElimination object from each BayesianNetwork object in order
to do inference (we need to run queries).
'''
# test_groups[i] is the DataFrame of fold i's held-out rows.
test_groups = [df.iloc[test_group_indexes[i]] for i in range(K)]
# Per-fold row counts, used as loop bounds and accuracy denominators later.
test_group_sizes = np.array([elem.size for elem in test_group_indexes])
train_group_sizes = np.array([elem.size for elem in train_group_indexes])
# Iterating the network yields its variables (presumably node names — verify
# against make_bn); everything except the target is query evidence.
environment_variables = [variable for variable in bayesian_networks[0]]
# Raises ValueError if TARGET_VARIABLE is not a node of the network.
environment_variables.remove(TARGET_VARIABLE)
'''
The function fast_query will query all of the bayesian networks with the whole environment
map and map the queries to their respective outputs, reducing computation time by
eliminating repeat calculations.
'''
# fq: per-fold lookup tables of query results; num_queries: distinct queries run;
# method_used: inference method label; external_errors: failures inside fast_query.
fq, num_queries, method_used, external_errors = fast_query(bayesian_networks,
                                                           test_group_indexes,
                                                           environment_variables,
                                                           df,
                                                           TARGET_VARIABLE)
# validations[i] is a boolean array: per-row prediction correctness for fold i.
validations = []
# (client_ID, prediction) pairs for clients the model flags; the thresholds
# below encode the project's risk-banding rules.
high_risk_group = []
moderate_risk_group = []
# false_negatives[i] lists the client IDs of fold i's false negatives.
false_negatives = []
# BUG FIX: errors were previously accumulated in a single global counter that
# was subtracted from EVERY fold's size in the accuracy denominators, over-
# correcting each fold whenever any fold errored.  We now count errors per
# fold (group_error_counts) and keep error_count only as the grand total,
# which the report section still reads.
group_error_counts = []
error_count = 0
for i in range(K):
    validation = []
    false_negative_lst = []
    fold_errors = 0
    for j in range(test_group_sizes[i]):
        # Hoist the row lookup: one .iloc call instead of three per iteration.
        row = test_groups[i].iloc[j]
        # 'state_instantiation' is a Series from which we can obtain the instantiated state variables.
        state_instantiation = row[environment_variables]
        # 'client_ID' is the ID in the dataset of the row we are currently investigating
        client_ID = row['ID']
        # 'prediction' is the max likelihood state of the target variable given the states of the other variables.
        prediction = fq[i].loc[tuple(state_instantiation.values)]['0_y']
        # 'actual_target_value' is the true value of the state variable we are trying to predict.
        actual_target_value = row[TARGET_VARIABLE]
        try:
            # A prediction above .5 is treated as a positive call.
            validation.append((prediction > .5) == actual_target_value)
            if (prediction < .60) and (prediction > .5) and actual_target_value:
                # Barely-positive prediction on a true positive: moderate risk.
                moderate_risk_group.append((client_ID, prediction))
            elif not round(prediction) and actual_target_value:
                # Negative prediction on a true positive: false negative / high risk.
                false_negative_lst.append(client_ID)
                high_risk_group.append((client_ID, prediction))
        except (ValueError, TypeError) as e:
            # Malformed prediction/target values are counted and reported, not fatal.
            fold_errors += 1
            print(e)
    false_negatives.append(false_negative_lst)
    validations.append(np.array(validation))
    group_error_counts.append(fold_errors)
    error_count += fold_errors
group_error_counts = np.array(group_error_counts)
# rc_sizes is the number of false negatives in each testing group which we use in an error computation later.
rc_sizes = np.array([len(lst) for lst in false_negatives])
""" ERROR CALCULATION """
num_correct_predictions = np.array([np.sum(validation) for validation in validations])
# Each fold's denominator subtracts that fold's OWN error count (previously the global total).
group_prediction_accuracies = num_correct_predictions / (test_group_sizes - group_error_counts)
# Accuracy recomputed with false negatives excluded from the denominator as well.
group_prediction_accuracies_fn = num_correct_predictions / (test_group_sizes - rc_sizes - group_error_counts)
mean_fn = np.mean(group_prediction_accuracies_fn)
std_fn = np.std(group_prediction_accuracies_fn)
mean = np.mean(group_prediction_accuracies)
std = np.std(group_prediction_accuracies)
""" REPORT PRINTING """
date_stamp = datetime.now()
end = time()
bn = bayesian_networks[0]
file_name = return_client_csv(high_risk_lst=high_risk_group,
moderate_risk_lst=moderate_risk_group,
data_frame=df)
report = f'################################################### {file_name} {date_stamp} >{DROP_CUTOFF} MissingValues dropped!!! ##################################################\n\n' \
f'Method Used : {method_used}\n' \
f'Prediction Accuracy : {round(mean, 5)}\n' \
f'Standard Deviation : {round(std, 5)}\n' \
f'Accuracy without "false negatives" : {round(mean_fn, 5)}\n' \
f'Standard Deviation without "false negatives" : {round(std_fn, 5)}\n' \
f'Execution Time : {round(((end - begin) / 60), 2)} minutes\n' \
f'The network was queried {num_queries} times. FastQuery saved {len(df) - num_queries} redundant queries.\n' \
f'Error count : {error_count + external_errors}\n' \
f'Nodes : {bn.nodes}\n' \
f'Edges : {bn.edges}\n' \
f'In Degree : {bn.in_degree}\n' \
f'Out Degree : {bn.out_degree}\n' \
f'States : {bn.states}\n\n\n\n'
print(report)
with open('BN_testing_new_query_evidence_style.txt', 'a') as file:
file.write('\n\n' + report)
with open(f'Client_Spreadsheets/{file_name}/{file_name}.txt', 'w+') as file:
file.write(report)