# -*- coding: utf-8 -*-
"""
Created on Fri Sep 17 18:13:16 2021
@author: rapha
"""
# In-depth analysis of random forest classification
from textvectorization import loadTfidfdataset
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
random_seed = 42
savefile = "wikidset_comb_1_basic"
def run_randomForest(savefile, vectorizer, ttsplit,
                     getFeatImportance=True, clean=True):
    """
    Train and evaluate a random forest classifier; the workflow mirrors
    the logistic regression script.

    Parameters
    ----------
    savefile : string
        Folder from which to load and where to save data.
    vectorizer : string
        Only the "sklearn" vectorizer is supported.
    ttsplit : float between 0 and 1
        Fraction of the data held out as the test set.
    getFeatImportance : bool, optional
        Whether feature importance is calculated and saved. The default is True.
    clean : bool, optional
        Use the lemmatized and stemmed data. The default is True.

    Returns
    -------
    score : float
        Test-set accuracy.
    df : pandas.DataFrame
        Feature importances, only returned if getFeatImportance is True.
    """
    texts_train, texts_test, labels_train, labels_test = loadTfidfdataset(
        savefile, vectorizer=vectorizer, split=ttsplit, clean=clean)
    classifier = RandomForestClassifier(random_state=random_seed).fit(
        texts_train, labels_train)
    preds = classifier.predict(texts_test)
    score = metrics.accuracy_score(labels_test, preds)
    report = metrics.classification_report(labels_test, preds)
    print("Accuracy:", score)
    # Output files get a "_clean" suffix when the lemmatized/stemmed data are used.
    suffix = "_clean" if clean else ""
    with open(savefile + "/" + vectorizer + "_tfidf_randFor_report" + suffix + ".txt", "w") as info:
        info.write(report)
    if getFeatImportance:
        with open(savefile + "/feature_names" + suffix + ".pickle", "rb") as handle:
            features = pickle.load(handle)
        # Mean decrease in impurity (MDI) per feature; the standard deviation
        # across the per-tree importances serves as a spread estimate.
        importances = classifier.feature_importances_
        std = np.std([tree.feature_importances_
                      for tree in classifier.estimators_], axis=0)
        df = pd.DataFrame(data=zip(features, list(importances), list(std)),
                          columns=["feature", "importance", "std"])
        df.to_csv(savefile + "/RandForest_featImportance" + suffix + ".csv")
        return score, df
    return score
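
# A minimal sketch, not part of the original workflow: random forests can
# also be validated cheaply via the out-of-bag (OOB) score, computed from the
# bootstrap samples each tree did not see during fitting. The helper name and
# its use are hypothetical.
def oob_sanity_check(texts_train, labels_train):
    oob_clf = RandomForestClassifier(oob_score=True, random_state=random_seed)
    oob_clf.fit(texts_train, labels_train)
    # oob_score_ is the accuracy estimated on the out-of-bag samples.
    return oob_clf.oob_score_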
def feat_imp_plot_rf(featureImportance, savefile, plot_mostImportant=True, clean=True):
    """
    Plot the feature importance distribution from the data frame and,
    optionally, the most important features with mean and standard deviation.
    """
    suffix = "_clean" if clean else ""
    fig, ax = plt.subplots(figsize=(5, 10))
    fi = featureImportance["importance"]
    # Count features with importance above 0.001 and above 0.0001.
    above001 = np.sum(fi > 0.001)
    above0001 = np.sum(fi > 0.0001)
    mean = np.mean(fi)
    mean_sd = np.mean(featureImportance["std"])
    ax.boxplot(fi)
    ax.axhline(0.001, color="green")
    ax.axhline(0.0001, color="lightgreen")
    with open(savefile + "/RandForest_featImp_stats" + suffix + ".txt", "w") as info:
        info.write("{}\nFeatsAbove0.001: {}\nFeatsAbove0.0001: {}\n"
                   "Mean: {}\nMeanStdDev: {}".format(
                       savefile, above001, above0001, mean, mean_sd))
    plt.savefig(savefile + "/RandForest_featImpdistribution" + suffix + ".png")
    if plot_mostImportant:
        # Keep the features above each threshold, sorted by importance.
        mifeatsplus001 = featureImportance[
            featureImportance["importance"] > 0.001].sort_values(
                by="importance", ascending=False)
        mifeatsplus0001 = featureImportance[
            featureImportance["importance"] > 0.0001].sort_values(
                by="importance", ascending=False)
        mifeatsplus001.to_csv(savefile + "/RandForest_mi_feats001" + suffix + ".csv")
        mifeatsplus0001.to_csv(savefile + "/RandForest_mi_feats0001" + suffix + ".csv")
        plusfeats = list(mifeatsplus001["feature"])
        pluscoeff = list(mifeatsplus001["importance"])
        plusstd = list(mifeatsplus001["std"])
        forest_importances = pd.Series(pluscoeff, index=plusfeats)
        fig, axs = plt.subplots(figsize=(10, 8))
        # Use MDI, since permutation importance is computationally too costly
        # for this many features.
        forest_importances.plot.bar(yerr=plusstd, ax=axs, color="grey")
        axs.set_title("Feature importances using MDI")
        axs.set_ylabel("Mean decrease in impurity")
        fig.tight_layout()
        fig.savefig(savefile + "/RandForest_featImpMDI" + suffix + ".png")
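
# A minimal sketch of the alternative mentioned above: permutation importance
# on the held-out split. It is far more costly than MDI for many features,
# which is why the script uses MDI. The helper name and its use here are
# hypothetical; it assumes a fitted classifier and the test split.
def perm_importance_sketch(classifier, texts_test, labels_test, features):
    from sklearn.inspection import permutation_importance
    # n_repeats=5 keeps the number of shuffles per feature manageable.
    perm = permutation_importance(classifier, texts_test, labels_test,
                                  n_repeats=5, random_state=random_seed)
    return pd.DataFrame({"feature": features,
                         "importance": perm.importances_mean,
                         "std": perm.importances_std})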
# Run for the balanced dataset, once with the lemmatized-and-stemmed data
# (clean=True) and once with the basically cleaned data (clean=False).
score, feat_imp = run_randomForest(savefile="wikidset_comb_1_basic",
                                   vectorizer="sklearn", ttsplit=0.2, clean=True)
feat_imp_plot_rf(feat_imp, savefile="wikidset_comb_1_basic",
                 plot_mostImportant=True, clean=True)
score, feat_imp = run_randomForest(savefile="wikidset_comb_1_basic",
                                   vectorizer="sklearn", ttsplit=0.2, clean=False)
feat_imp_plot_rf(feat_imp, savefile="wikidset_comb_1_basic",
                 plot_mostImportant=True, clean=False)