# Data_Prep.py: data preparation helpers. Not a script; meant to be imported elsewhere.
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import precision_score, recall_score
def load_data():
    '''Load the raw pokemon dataset from Natdex_Data.csv into a DataFrame.'''
    pokemon_df = pd.read_csv("Natdex_Data.csv")
    return pokemon_df
def create_viability(tier: str):
    '''
    When applied to pokemon_df['tier'], returns a value indicating whether
    the pokemon is viable in OU or not.

    Parameters:
        tier (str): must be one of the recognized tiers; a ValueError is
            raised otherwise.

    Returns:
        bool: True for viable, False for not viable.
    '''
    viable_list = ['OU', 'Uber', 'AG']
    # extra entries are added to unviable_list to future-proof against tiers
    # that currently have no pokemon
    unviable_list = ['UUBL', 'UU', 'RUBL', 'RU', 'NFE', 'LC', 'LC Uber',
                     '(OU)', '(UU)', '(RU)']
    if tier in viable_list:
        return True
    elif tier in unviable_list:
        return False
    else:
        raise ValueError(f"Unidentified tier: {tier!r}")
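# Illustrative sketch of create_viability applied element-wise to a tier
# column (assumes the tier strings follow the lists above); kept as a comment
# so importing this module stays side-effect free:
#
# >>> pd.Series(['OU', 'UU', 'AG']).apply(create_viability)
# 0     True
# 1    False
# 2     True
# dtype: bool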
def data_pipeline(pokemon_df, drop_prevos=True, kind='minmax', valid_size=0.1,
                  full_data=False, get_scaler=False):
    '''
    Preps the data from Natdex_Data.csv. It does this by
    1. taking the raw dataframe loaded from the csv file
    2. creating the target feature 'isViable'
    3. ordinally encoding the boolean columns, including the target feature
    4. creating dummy features for the 18 types in 'type1'
    5. manually setting the values in these columns for 'type2'
    6. manually removing the outliers of Shedinja, Slaking, and Regigigas
    7. scaling the numerical statistical data according to the parameter 'kind'
    I chose to encode type2 this way because type1 and type2 are inherently
    similar, meaning they should be encoded with the same values.

    Parameters
    ----------
    pokemon_df : pd.DataFrame
        the dataframe containing the raw data from Natdex_Data.csv
    drop_prevos : bool, default=True
        if True, drops all the pre-evolutions from the dataset
    kind : {'minmax', 'normal'}, default='minmax'
        determines how to scale the numerical data: 'minmax' scales the data
        with sklearn.preprocessing.MinMaxScaler, while 'normal' uses
        sklearn.preprocessing.StandardScaler to scale and normalize the data
    valid_size : float, default=0.1
        the proportion of the dataset that is held aside for validation
    full_data : bool, default=False
        if True, returns the entire prepared dataset as (X, y) without
        holding out a validation split
    get_scaler : bool, default=False
        if True, returns (X, y, scaler) so the fitted scaler can be reused
        on new data

    Returns
    -------
    X, X_valid, y, y_valid : tuple
        X : pd.DataFrame
            the features of the pokemon dataset, scaled, encoded, and with
            outliers removed
        X_valid : pd.DataFrame
            a portion of the dataset that is held aside for final validation
        y : pd.Series
            the target feature of the pokemon dataset
        y_valid : pd.Series
            the labels for X_valid
        If get_scaler is True, (X, y, scaler) is returned instead; if
        full_data is True, (X, y) is returned instead.
    '''
    # add the feature-engineered target feature
    pokemon_df['isViable'] = pokemon_df['tier'].apply(create_viability)
    # drop the pre-evolutions according to the parameter
    if drop_prevos:
        pokemon_df = pokemon_df[pokemon_df['isFinal']]
    # ordinally encode alternate, isLegend, isFinal, and isViable
    # (note: pd.factorize assigns integer codes by order of appearance)
    pokemon_df['alternate'] = pd.factorize(pokemon_df['alternate'])[0]
    pokemon_df['isLegend'] = pd.factorize(pokemon_df['isLegend'])[0]
    pokemon_df['isFinal'] = pd.factorize(pokemon_df['isFinal'])[0]
    pokemon_df['isViable'] = pd.factorize(pokemon_df['isViable'])[0]
    # manually encode type1 and type2 (one-hot encoding with the intersection)
    type_list = ['bug', 'dark', 'dragon', 'electric', 'fairy', 'fighting', 'fire',
                 'flying', 'ghost', 'grass', 'ground', 'ice', 'normal', 'poison',
                 'psychic', 'rock', 'steel', 'water']
    temp = pd.get_dummies(pokemon_df['type1'])
    pokemon_df = pd.concat([pokemon_df, temp], axis='columns')
    for thistype in type_list:
        pokemon_df.loc[pokemon_df['type2'] == thistype, thistype] = 1
    # drop unneeded columns
    DROP_COLS = ['name', 'ability1', 'ability2', 'hiddenability', 'isFinal',
                 'tier', 'type1', 'type2']
    pokemon_df = pokemon_df.drop(labels=DROP_COLS, axis=1)
    # remove the outliers of Shedinja, Regigigas, and Slaking by index label
    pokemon_df = pokemon_df.drop(291, axis=0)
    pokemon_df = pokemon_df.drop(485, axis=0)
    pokemon_df = pokemon_df.drop(288, axis=0)
    # scale the data according to the parameter
    if kind == 'minmax':
        from sklearn.preprocessing import MinMaxScaler
        scaler = MinMaxScaler(copy=True)
    elif kind == 'normal':
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler(copy=True)
    else:
        raise ValueError("kind must be 'minmax' or 'normal'")
    scale_columns = ['hp', 'atk', 'physdef', 'spatk', 'spdef', 'speed', 'bst']
    pokemon_df[scale_columns] = scaler.fit_transform(pokemon_df[scale_columns])
    # split into features and target
    X = pokemon_df.drop('isViable', axis=1)
    y = pokemon_df['isViable']
    if get_scaler:
        return X, y, scaler
    if full_data:
        return X, y
    # split into validation and usage data
    from sklearn.model_selection import train_test_split
    X, X_valid, y, y_valid = train_test_split(X, y, test_size=valid_size,
                                              stratify=y)
    return X, X_valid, y, y_valid
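# Illustrative sketch of typical calls to data_pipeline (assumes Natdex_Data.csv
# is available in the working directory); kept as a comment so importing this
# module stays side-effect free:
#
# >>> df = load_data()
# >>> X, X_valid, y, y_valid = data_pipeline(df, kind='minmax', valid_size=0.1)
# >>> X_full, y_full = data_pipeline(load_data(), full_data=True)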
def test_model(model, X, y, print_info=False):
    '''
    Using the inputted model, this function
    1. uses StratifiedKFold cross validation with 5 splits
    2. calculates the f1, accuracy, precision, and recall scores of each split
    3. if print_info is True, prints the model type as well as the average of
       each score across the splits
    4. returns a dict with the mean of each score across the folds

    Parameters
    ----------
    model : any sklearn model
        The model that is being tested. As long as it implements the .fit()
        and .predict() methods, it will work fine with this function
    X : pd.DataFrame
        The features of the dataset. Must be numerical and ideally scaled to
        work with all model types
    y : pd.Series
        The labels for X
    print_info : bool
        if True, prints model information to the system console. Default is
        False

    Returns
    -------
    dict
        a dict with keys 'f1', 'acc', 'prec', and 'rec' mapping to the mean
        of each metric across the folds
    '''
    # precision and recall are tracked because the classes are imbalanced
    f1scores = []
    acc_scores = []
    precisionscores = []
    recallscores = []
    skf = StratifiedKFold(n_splits=5)
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        f1scores.append(f1_score(y_test, y_pred))
        acc_scores.append(accuracy_score(y_test, y_pred))
        precisionscores.append(precision_score(y_test, y_pred))
        recallscores.append(recall_score(y_test, y_pred))
    if print_info:
        print(type(model))
        print("Mean f1 score:", np.mean(f1scores))
        print("Mean accuracy score:", np.mean(acc_scores))
        print("Mean precision score:", np.mean(precisionscores))
        print("Mean recall score:", np.mean(recallscores))
    return {'f1': np.mean(f1scores), 'acc': np.mean(acc_scores),
            'prec': np.mean(precisionscores), 'rec': np.mean(recallscores)}
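if __name__ == "__main__":
    # Optional smoke-test sketch: only runs when this file is executed directly,
    # so importing the module elsewhere is unaffected. LogisticRegression is an
    # illustrative choice of classifier, not part of the original pipeline, and
    # this assumes Natdex_Data.csv is present in the working directory.
    from sklearn.linear_model import LogisticRegression

    pokemon_df = load_data()
    X, X_valid, y, y_valid = data_pipeline(pokemon_df)
    results = test_model(LogisticRegression(max_iter=1000), X, y, print_info=True)
    print(results)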