########################################################################
##### About
# Authors: Anish Chaluvadi and Thomas Delvaux
# ME-6930 036
# 04/19/2021
# For data obtained from:
# https://www.kaggle.com/c/titanic/data
show_figures = 0 # Toggles whether figures are displayed as the script runs
# 0 ~ false; 1 ~ true
# WARNING: run_graphviz requires graphviz (both the executable and the Python library);
# on Windows the executables MUST be added to PATH.
# You may need to restart your computer after installation (particularly on Windows).
run_graphviz = 0 # Runs graphviz code to create PDF visualizations
# of the Decision Tree model
print_extra_info = 0 # Prints extra intermediary information
# as the script runs
# WARNING: Some plots require the mlxtend data-visualization package,
# which can be installed through conda-forge or with 'pip install mlxtend' (PyPI)
########################################################################
##### Data Cleaning and Correlations
### Import Necessary Libraries
import numpy as np # Linear algebra
import pandas as pd # Data processing
import os # File locating
import seaborn as sns # Constructing graphs
import matplotlib.pyplot as plt # Plotting results
### Importing Data
train_csv = os.path.join('data', 'train.csv') # Dataset to train machine learning (ML) algorithms
test_csv = os.path.join('data', 'test.csv') # Dataset to test ML algorithms
answer_csv = os.path.join('data', 'gender_submission.csv') # Answer-key dataset
combined_csv = os.path.join('data', 'test_train_combined.csv') # Dataset merging the solutions
# and training data into one combined dataset
# Create Pandas Data Frames
combined_df = pd.read_csv(combined_csv, index_col=0)
combine = [combined_df] # List of datasets to process; useful for filling in empty entries
#print(df.columns)
#print(df.shape)
# Checking for missing data
if print_extra_info == 1:
    print('Empty entries before filling in age data:')
    print(combined_df.isnull().sum())
# Create Heatmap of Entries Missing Data
sns.heatmap(combined_df.isnull())
plt.title('Heatmap of Uncleaned Data (Empty Entries in White)')
plt.tight_layout()
plt.savefig('MissingDataCheck_Uncleaned.png')
if show_figures == 1:
    plt.show()
plt.close()
### Data Cleanup
for dataset in combine: # Perform this action for each dataset in the list (here, the combined dataset)
    # Removing unused columns because of too much missing data
    dataset.drop(['Ticket','Cabin'],inplace=True,axis=1)
    ## Making sex data usable
    # Rename column
    dataset.rename({'Sex' : 'Male'}, inplace=True, axis=1)
    # Convert the Male column into usable binary data
    dataset['Male'] = dataset['Male'].map( {'female': 0, 'male': 1} ).astype(int)
## Age
# Method 1 based on:
# https://www.kaggle.com/startupsci/titanic-data-science-solutions
# Plot relationship between gender, age, and pclass
grid = sns.FacetGrid(combined_df, row='Pclass', col='Male', height=2.2, aspect=1.6)
grid.map(plt.hist, 'Age', alpha=.5, bins=20)
grid.add_legend()
plt.tight_layout()
plt.savefig('AgeDistribution.png')
if show_figures == 1:
    plt.show()
plt.close()
# Fill in missing ages
guess_ages = np.zeros((2,3)) # Matrix to fill
for dataset in combine: # Perform this action for each dataset in the list
    for i in range(0, 2): # Loop through sexes
        for j in range(0, 3): # Loop through passenger classes
            # Ignoring empty entries, obtain a list of ages for the specified sex and passenger class
            guess_df = dataset[(dataset['Male'] == i) & \
                               (dataset['Pclass'] == j+1)]['Age'].dropna()
            # Guess the median age for the specific sex and passenger class
            age_guess = guess_df.median()
            # Round the guessed age to the nearest 0.5 (e.g. a median of 28.7 becomes 28.5)
            guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
    for i in range(0, 2): # Loop through sexes
        for j in range(0, 3): # Loop through passenger classes
            # Fill in missing ages based on the passenger's sex and class
            dataset.loc[ (dataset.Age.isnull()) & (dataset.Male == i) & (dataset.Pclass == j+1),\
                'Age'] = guess_ages[i,j]
    # Make sure all entries in Age are now integers
    dataset['Age'] = dataset['Age'].astype(int)
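# An added diagnostic (not part of the original pipeline): inspect the imputed
# median ages; rows are sex (0 ~ female, 1 ~ male), columns are Pclass 1-3
if print_extra_info == 1:
    print('Imputed median ages (rows: female/male; columns: Pclass 1-3):')
    print(guess_ages)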
# Preview the data
#print(combined_df.head())
# Checking for missing data
#print('Empty entries after filling in age data:')
#print(combined_df.isnull().sum())
#print(combined_df.isnull())
### Feature Engineering
## Title
import re # For extracting titles from passenger names
# Function to extract a passenger's title from their name
def get_title(name):
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it
    if title_search:
        return title_search.group(1)
    return ""
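# For example, get_title('Braund, Mr. Owen Harris') returns 'Mr'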
# Create a new feature Title, containing the titles of passenger names
for dataset in combine:
    dataset['Title'] = dataset['Name'].apply(get_title) # Use the function
for dataset in combine:
    # Group all non-common titles into a single category "Rare"
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    # Group Mlle and Ms into a single category "Miss"
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    # Group Mme with the "Mrs" category
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    # Remove the Name column now that the Title feature has been engineered
    dataset.drop(['Name'],inplace=True,axis=1)
## Family Size
combined_df['FamilySize'] = combined_df['SibSp'] + combined_df['Parch'] + 1 # Creates a separate single variable for family size
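# For example, a passenger with 1 sibling and 2 parents aboard gets FamilySize 4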
# Create Heatmap to Check Again for Entries Missing Data
sns.heatmap(combined_df.isnull())
plt.tight_layout()
plt.savefig('MissingDataCheck_PartiallyCleaned.png')
if show_figures == 1:
    plt.show()
plt.close()
# The heatmap should be a big red/pink square,
# which indicates that no empty entries are present
#
# If it is deep purple / navy blue, there are empty
# entries that somehow were missed. This should not
# happen.
# Checking for missing data
if print_extra_info == 1:
    print('\nEmpty entries before removing missing data:')
    print(combined_df.isnull().sum())
# Remove missing data
# Based on: https://stackoverflow.com/questions/51374068/how-to-remove-a-row-which-has-empty-column-in-a-dataframe-using-pandas
# Remove any rows missing data. This should only remove the 2 rows missing embarkation data.
combined_df = combined_df.dropna()
# Confirm there are no more entries missing data
#print(combined_df.isnull().sum())
## Making embarkation data usable
# Empty entries must be removed before this can be done.
# OPTION 1: Change strings to integers
#ports = {"S": 0, "C": 1, "Q": 2}
#combined_df['Embarked'] = combined_df['Embarked'].map(ports)
# OPTION 2: Split categorical data into separate boolean columns
combined_df = pd.get_dummies(combined_df)
# Split the string-based embarked data into three columns with boolean
# integers (in other words, convert the embarked data into a form that
# Random Forest can use)
# Most algorithms, including Random Forest and Decision Trees, can't use
# columns of string-based data, making this conversion necessary.
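# A minimal illustration (an added example, not part of the original pipeline)
# of what pd.get_dummies does: each category becomes its own 0/1 column
if print_extra_info == 1:
    demo = pd.DataFrame({'Embarked': ['S', 'C', 'Q']})
    print(pd.get_dummies(demo)) # Columns: Embarked_C, Embarked_Q, Embarked_S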
# Preview New Data Format
#print(combined_df.head())
# Checking for missing data
if print_extra_info == 1:
    print('\nEmpty entries after removing missing data:')
    print(combined_df.isnull().sum())
# Create Heatmap to Check Again for Entries Missing Data
sns.heatmap(combined_df.isnull())
plt.tight_layout()
plt.savefig('MissingDataCheck_Cleaned.png')
if show_figures == 1:
    plt.show()
plt.close()
# As before, the heatmap should be a big red/pink square, indicating that no
# empty entries remain and the data is ready for use with ML
### Correlations
# Correlation code based on:
# https://likegeeks.com/python-correlation-matrix/
# Create Matrix of Correlations for Training Dataset
from sklearn.model_selection import train_test_split
corr = combined_df.copy()
X = corr.copy()
#X = dt.drop(['Survived','Embarked'], axis=1) #dt.iloc[:, [2,3]].values
y = corr['Survived'] #dt.iloc[:, 4].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
correlation_mat = X_train.corr()
#correlation_mat = combined_df.corr()
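# An added diagnostic (not in the original script): rank features by their
# correlation with survival
if print_extra_info == 1:
    print(correlation_mat['Survived'].sort_values(ascending=False))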
# Create Heatmap of Correlations
sns.heatmap(correlation_mat, annot = True)
plt.title("Correlation matrix of Titanic roster data")
plt.xlabel("passenger features")
plt.ylabel("passenger features")
plt.tight_layout()
plt.savefig('Correlations.png')
if show_figures == 1:
    plt.show()
plt.close()
########################################################################
##### Thomas's ML Algorithms
# Based on: https://www.kaggle.com/vanshjatana/applied-machine-learning
print('\nThomas\'s ML Algorithm Results:')
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
## ML Algorithm 1: Decision Trees
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier # Older line of code, left for compatibility
dt = combined_df.copy()
#dt.head()
# X contains the features, y contains the "solution".
# We drop the solution column "Survived" out of the data for testing and training.
# (The string-based "Embarked" and "Title" columns were already one-hot encoded
# above, so everything left in X is numeric, as the algorithm requires.)
X = dt.drop(['Survived'], axis=1) # Use the whole dataset as features, except the Survived column
y = dt['Survived'] #dt.iloc[:, 4].values # Match the X with the Survived column from the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0) # Splitting combined dataset into 80% train and 20% test
sc_X = StandardScaler() # Standardize the features to zero mean and unit variance
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
classifier=tree.DecisionTreeClassifier(criterion="entropy", min_samples_split=100, max_depth=1, random_state=0) # Calling the decision tree classifier from sklearn
classifier.fit(X_train,y_train)
y_pred_dt=classifier.predict(X_test)
acc_dt=accuracy_score(y_test, y_pred_dt) # Calculate the accuracy of decision tree algorithm
print(f'Decision Trees Accuracy: {round(acc_dt*100,3)}%')
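# An added diagnostic (not in the original script): per-class precision and
# recall via sklearn's classification_report
if print_extra_info == 1:
    from sklearn.metrics import classification_report
    print(classification_report(y_test, y_pred_dt, target_names=['Died', 'Survived']))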
# Confusion Matrix
# Based on: https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
sns.heatmap(confusion_matrix(y_test,y_pred_dt),annot=True,fmt='3.0f',cmap="Blues") # Plot the confusion matrix for the decision tree algorithm
plt.title('Decision Trees Confusion Matrix', y=1.05, size=15)
plt.tight_layout()
plt.savefig('Confusion_Matrix_dt.png')
if show_figures==1:
    plt.show()
plt.close()
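# An added sketch (not in the original script): estimate generalization with
# 5-fold cross-validation on the training split using sklearn's cross_val_score
if print_extra_info == 1:
    from sklearn.model_selection import cross_val_score
    cv_scores = cross_val_score(classifier, X_train, y_train, cv=5)
    print(f'Decision Trees 5-fold CV accuracy: {cv_scores.mean():.3f} +/- {cv_scores.std():.3f}')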
# Documentation on decision trees:
# https://scikit-learn.org/stable/modules/tree.html#tree
# Generate decision tree visualization PDF
# WARNING: Requires graphviz (both the executable and the Python library)
# You may need to restart your computer after installation
# Helpful for getting class names from model:
# https://stackoverflow.com/questions/39476020/get-feature-and-class-names-into-decision-tree-using-export-graphviz
#print(classifier.classes_.astype(str))
#print(['Died', 'Survived'])
# Documentation on exporting to graphviz:
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.export_graphviz.html
if run_graphviz == 1:
    import graphviz
    dot_data = tree.export_graphviz(classifier, out_file=None,
                                    feature_names=X.columns,
                                    # Use 'Died' for '0' and 'Survived' for '1' in class names
                                    class_names=['Died', 'Survived'], #classifier.classes_.astype(str),
                                    filled=True, rounded=True,
                                    special_characters=True,
                                    proportion=True)
    graph = graphviz.Source(dot_data)
    graph.render("titanic_decision-tree") # Writes titanic_decision-tree.pdf
#print(y_train.astype(str).loc[:])
# Feature Importance for Decision Tree
# Loosely based on https://machinelearningmastery.com/calculate-feature-importance-with-python/ but mainly from Anish's previous research experience
# For more comments on each line of code, see the feature importance section for random forest algorithm
importance = [] # List of importance values to plot
names = [] # List of corresponding feature names
features = [r'Pclass', r'Male', r'Age', r'SibSp', r'Parch', r'Fare', r'Family Size', r'Embarked_C', r'Embarked_Q', r'Embarked_S', r'Title_Master', r'Title_Miss', r'Title_Mr', r'Title_Mrs', r'Title_Rare'] # List all selected features (to be used as axis labels in the plot)
for i in range(len(classifier.feature_importances_)):
    if classifier.feature_importances_[i] > 0.005: # 0.005 is a threshold chosen to keep only "important" features
        importance.append(classifier.feature_importances_[i]) # Add importance value to importance list
        names.append(features[i]) # Add feature name to names list
#print(importance)
#print(names)
#print(len(names))
fig, ax = plt.subplots(figsize=(14, 6))
y_pos = np.arange(len(names))
opacity = 0.5
plt.barh(y_pos, importance, alpha=opacity, color='b')
plt.yticks(y_pos, names)
plt.tick_params(axis='x', labelsize = 15)
plt.tick_params(axis='y', labelsize = 15)
plt.grid(False)
plt.ylabel('Features', fontsize = 20)
plt.xlabel('Importance', fontsize = 20)
plt.tight_layout()
fig.savefig('feature_importance_dt.png', bbox_inches='tight', dpi=400)
if show_figures==1:
    plt.show()
plt.close()
## ML Algorithm 2: Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = combined_df.copy()
#rf.head()
X = rf.drop(['Survived'], axis=1)
y = rf['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
model = RandomForestClassifier(n_estimators=100, min_samples_split=100, max_depth=6, random_state=0)
model.fit(X_train, y_train)
Y_pred_rf = model.predict(X_test)
acc_rf = model.score(X_test, y_test)
print(f'Random Forest Accuracy: {round(acc_rf*100,3)}%')
# Confusion Matrix
# Based on https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
sns.heatmap(confusion_matrix(y_test,Y_pred_rf),annot=True,fmt='3.0f',cmap="Blues")
plt.title('Random Forest Confusion Matrix', y=1.05, size=15)
plt.tight_layout()
plt.savefig('Confusion_Matrix_rf.png')
if show_figures==1:
    plt.show()
plt.close()
# Feature Importance for Random Forest
# Loosely based on https://machinelearningmastery.com/calculate-feature-importance-with-python/ but mainly from Anish's previous research experience
importance = [] # List of importance values to plot
names = [] # List of corresponding feature names
features = [r'Pclass', r'Male', r'Age', r'SibSp', r'Parch', r'Fare', r'Family Size', r'Embarked_C', r'Embarked_Q', r'Embarked_S', r'Title_Master', r'Title_Miss', r'Title_Mr', r'Title_Mrs', r'Title_Rare'] # List all selected features (to be used as axis labels in the plot)
for i in range(len(model.feature_importances_)):
    if model.feature_importances_[i] > 0.005: # 0.005 is a threshold chosen to keep only "important" features
        importance.append(model.feature_importances_[i]) # Add importance value to importance list
        names.append(features[i]) # Add feature name to names list
#print(importance)
#print(names)
#print(len(names))
fig, ax = plt.subplots(figsize=(14, 6))
y_pos = np.arange(len(names))
opacity = 0.5
plt.barh(y_pos, importance, alpha=opacity, color='b')
plt.yticks(y_pos, names)
plt.tick_params(axis='x', labelsize = 15)
plt.tick_params(axis='y', labelsize = 15)
plt.grid(False)
plt.ylabel('Features', fontsize = 20)
plt.xlabel('Importance', fontsize = 20)
plt.tight_layout()
fig.savefig('feature_importance_rf.png', bbox_inches='tight', dpi=400)
if show_figures==1:
    plt.show()
plt.close()
########################################################################
##### Anish's ML Algorithms
print('\nAnish\'s ML Algorithm Results:')
# Based on: https://www.kaggle.com/vinothan/titanic-model-with-90-accuracy; https://www.kaggle.com/startupsci/titanic-data-science-solutions#Titanic-Data-Science-Solutions
## ML Algorithm 3: Logistic Regression
from sklearn.linear_model import LogisticRegression
lr_df = combined_df.copy() # Create a copy of the dataframe for Logistic Regression algorithm
X = lr_df.drop(['Survived'], axis=1) # Establish all the features to be used in the algorithm (i.e. drop the survived column)
y = lr_df['Survived'] # Needs to model only with respect to the survived column
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(X, y, test_size = 0.20, random_state = 0)
logreg = LogisticRegression(solver='lbfgs', max_iter=1000) # Calling the logreg function from sklearn
logreg.fit(X_train_lr, y_train_lr) # Determining best fit of the logistic regression from training data (which is the same 80% of passengers as above)
Y_pred_lr = logreg.predict(X_test_lr) # Predicting Y values depending on test set
acc_log = logreg.score(X_test_lr, y_test_lr) # Calculating accuracy
print(f'Logistic Regression Accuracy: {round(acc_log*100,3)}%')
# Confusion Matrix
# Based on https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
sns.heatmap(confusion_matrix(y_test_lr,Y_pred_lr),annot=True,fmt='3.0f',cmap="Blues")
plt.title('Logistic Regression Confusion Matrix', y=1.05, size=15)
plt.tight_layout()
plt.savefig('Confusion_Matrix_lr.png')
if show_figures==1:
    plt.show()
plt.close()
# Data Analysis Visualization
print(logreg.intercept_) # Gives the intercept of the best fit logistic regression equation found by the algorithm
print(logreg.coef_) # Gives the coefficients of the best fit logistic regression equation for each feature found by the algorithm
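# An added note (not in the original script): exponentiating the logistic
# regression coefficients gives odds ratios, which are often easier to
# interpret than raw coefficients
if print_extra_info == 1:
    print(np.exp(logreg.coef_)) # Odds ratio for each feature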
plt.plot(X_test_lr, Y_pred_lr, 'o') # Plot each test-set feature against the predicted labels
plt.tight_layout()
plt.savefig('Plot_lr.png')
if show_figures==1:
    plt.show()
plt.close()
# Feature Importance for Logistic Regression
# Based on: https://machinelearningmastery.com/calculate-feature-importance-with-python/ but mainly from Anish's previous research experience
importance = logreg.coef_[0] # Use the fitted coefficients as feature importance values
plt.bar(features, importance) # Plot feature importance, reusing the feature-name list defined above
plt.xticks(rotation=45)
plt.tick_params(axis='x', labelsize = 12)
plt.tick_params(axis='y', labelsize = 15)
plt.grid(False)
plt.xlabel('Features', fontsize = 20)
plt.tight_layout()
plt.savefig('feature_importance_lr.png', bbox_inches='tight', dpi=400) # plt.savefig, not fig.savefig: fig still refers to the (closed) random forest figure
if show_figures==1:
    plt.show()
plt.close()
## ML Algorithm 4: K-Nearest Neighbors (KNN)
from sklearn.neighbors import KNeighborsClassifier
knn_df = combined_df.copy()
X = knn_df.drop(['Survived'], axis=1)
y = knn_df['Survived']
X_train_knn, X_test_knn, y_train_knn, y_test_knn = train_test_split(X, y, test_size = 0.20, random_state = 0)
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train_knn, y_train_knn)
Y_pred_knn = knn.predict(X_test_knn)
acc_knn = knn.score(X_test_knn, y_test_knn)
print(f'k-Nearest Neighbors Accuracy: {round(acc_knn*100,3)}%')
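# An added sketch (not in the original script): sweep k to see how the choice
# of n_neighbors affects test accuracy
if print_extra_info == 1:
    for k in (1, 3, 5, 7, 9):
        score = KNeighborsClassifier(n_neighbors=k).fit(X_train_knn, y_train_knn).score(X_test_knn, y_test_knn)
        print(f'k={k}: accuracy {score:.3f}')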
# Confusion Matrix
# Based on https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
sns.heatmap(confusion_matrix(y_test_knn,Y_pred_knn),annot=True,fmt='3.0f',cmap="Blues")
plt.title('k-Nearest Neighbors Confusion Matrix', y=1.05, size=15)
plt.tight_layout()
plt.savefig('Confusion_Matrix_knn.png')
if show_figures==1:
    plt.show()
plt.close()
# Feature Importance for k-Nearest Neighbors
# Based on https://machinelearningmastery.com/calculate-feature-importance-with-python/
# k-Nearest Neighbors does not natively support feature importance or feature
# selection; a model-agnostic alternative is sketched below.
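# An added sketch (not part of the original script): permutation importance is
# a model-agnostic substitute that works for KNN; it measures how much test
# accuracy drops when a feature's values are randomly shuffled
if print_extra_info == 1:
    from sklearn.inspection import permutation_importance
    perm = permutation_importance(knn, X_test_knn, y_test_knn, n_repeats=10, random_state=0)
    for name, score in zip(X.columns, perm.importances_mean):
        print(f'{name}: {score:.4f}')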
########################################################################
##### Post-processing
## Model Evaluation
print('\n')
models = pd.DataFrame({
    'Model': ['Decision Trees', 'Random Forest', 'Logistic Regression', 'k-Nearest Neighbors'],
    'Score': [acc_dt, acc_rf, acc_log, acc_knn]}) # Tag each ML algorithm with its respective accuracy score
models = models.sort_values(by='Score', ascending=False) # Reassign: sort_values returns a new, sorted DataFrame
print(models)