ml_data_prep.py

# This file performs pre-processing for machine learning in the form of feature
# selection and removal of correlated features. It can also perform high-level
# data cleaning, as there are functions that allow the user to remove columns
# that may have been useful for EDA but aren't useful for ML.
# Another purpose of this file is that it's not per se project specific:
# datasets from multiple projects can be passed through it and it will
# generate dummy variables and delete unneeded features without
# much customization.
import pandas as pd


def import_data(csv_file: str) -> pd.DataFrame:
    # import the data from a CSV file
    data_ml = pd.read_csv(csv_file)
    return data_ml


def delete_columns(data: pd.DataFrame, column_list: list) -> pd.DataFrame:
    # drop the listed columns in place and return the data frame
    data.drop(column_list, inplace=True, axis=1)
    return data


def add_dummies(data: pd.DataFrame) -> pd.DataFrame:
    # this function generates dummy variables and then
    # deletes the "default" column, as we're going to be
    # predicting "paid"
    # this is the only function that's specific to the
    # lending dataset
    # add dummy variables to the data frame
    data = pd.get_dummies(data)
    data.rename(columns={'loan_status_Fully Paid': 'paid'}, inplace=True)
    data.rename(columns={'loan_status_Charged Off': 'default'}, inplace=True)
    columns = ['default']
    data.drop(columns, inplace=True, axis=1)
    return data
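

def _demo_add_dummies() -> pd.DataFrame:
    # Illustrative sketch only, not part of the original pipeline: shows what
    # add_dummies does to a toy frame with a loan_status column. The
    # 'loan_amnt' column and the values below are assumptions for the example.
    example = pd.DataFrame({'loan_status': ['Fully Paid', 'Charged Off'],
                            'loan_amnt': [1000, 2500]})
    # after add_dummies the frame has 'loan_amnt' and the indicator target
    # 'paid'; 'default' is dropped because it is just the complement of 'paid'
    return add_dummies(example)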


def find_correlations(data: pd.DataFrame, threshold: float) -> list:
    # identify correlated features so they can be removed from the
    # data frame before modelling
    correlated_features = set()
    # drop the target variable (paid) and the columns that are only
    # included for annualized return calculations, not for machine
    # learning, before computing the correlation matrix
    columns = ['paid', 'total_payments', 'net_gain']
    cordata = data.drop(columns, axis=1)
    cor_matrix = cordata.corr()
    # walk the lower triangle of the correlation matrix and flag any
    # feature whose absolute correlation with an earlier feature
    # exceeds the threshold
    for i in range(len(cor_matrix.columns)):
        for j in range(i):
            if abs(cor_matrix.iloc[i, j]) > threshold:
                colname = cor_matrix.columns[i]
                correlated_features.add(colname)
    return list(correlated_features)
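

def _drop_correlated(data: pd.DataFrame, threshold: float = 0.9) -> pd.DataFrame:
    # Illustrative sketch only: find_correlations is not called in main() below,
    # but this is one way it could be wired into the pipeline. The 0.9 default
    # threshold is an assumption, not a value from the original project, and the
    # frame is assumed to still contain the 'paid', 'total_payments' and
    # 'net_gain' columns that find_correlations drops internally.
    correlated = find_correlations(data, threshold)
    return data.drop(correlated, axis=1)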


def write_data(data: pd.DataFrame, file_name: str) -> None:
    # writes the data frame to a CSV file with the given file name
    data.to_csv(file_name, index=False)
    print('file processing complete')


def main():
    ml_data = import_data('data/LC_2015_clean(4)_updated_April2022.csv')
    # list of columns to delete
    column_list = ['issue_d', 'emp_length',
                   'earliest_cr_line', 'addr_state',
                   'annual_inc', 'purpose', 'dti',
                   'total_pymnt', 'lost_principle',
                   'total_pymnt_inv', 'monthly_income',
                   'total_rec_late_fee', 'revol_bal',
                   'monthly_debt_payments',
                   'updated_monthly_debt_payments',
                   'total_rec_int', 'total_rec_prncp']
    data = delete_columns(ml_data, column_list)
    data = add_dummies(data)
    write_data(data, 'data/updated_lc_ML_ready_data_april2022.csv')


if __name__ == '__main__':
    main()
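

# Illustrative reuse sketch (comments only): as noted at the top of the file,
# other datasets can be passed through these functions with little change.
# The file name and column names below are hypothetical, not from this project:
#
#   data = import_data('data/some_other_project.csv')
#   data = delete_columns(data, ['id', 'free_text_notes'])
#   data = pd.get_dummies(data)
#   write_data(data, 'data/some_other_project_ml_ready.csv')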