# classification_project.py
import random
random.seed(16333323)  # seed the RNG first so results are reproducible
import pandas as pd
colors = ["#000000", "#006400", "#ff0000", "#ffd700", "#00ff00", "#e9967a", "#00ffff", "#0000ff", "#6495ed", "#ff1493"]  # colors for visualizations
import numpy as np
from sklearn.metrics import classification_report  # for evaluating model performance
import nltk  # for analyzing artist names and song titles
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
stops = set(
    stopwords.words("english")
) | set(["'d", "'ll", "'re", "'s", "'ve", 'could', 'might', 'must', "n't", 'need', 'sha', 'wo', 'would'])
# stopwords are filler words like "and", "is", "that" that carry no semantic information
# we add contractions that are not in the original set
from sklearn.feature_extraction.text import TfidfVectorizer
# NLP technique to turn lists of strings into word occurrence matrices
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.decomposition import TruncatedSVD
# to do PCA-style dimensionality reduction on sparse matrices
def word_vectors(train_data, test_data, num_components):  # turns lists of strings into tf-idf vectors
    vectorizer = TfidfVectorizer(
        decode_error="replace",
        strip_accents="unicode",
        tokenizer=nltk.word_tokenize,  # splits text into tokens
        stop_words=stops,  # ignores these
    )
    tf_idfs_train = vectorizer.fit_transform(train_data)
    vocab = vectorizer.vocabulary_  # all words seen in training
    vectorizer = TfidfVectorizer(
        decode_error="replace",
        strip_accents="unicode",
        tokenizer=nltk.word_tokenize,
        stop_words=stops,
        vocabulary=vocab,  # this is the main difference: only words found in the training data are used, to avoid leakage and keep everything consistent
    )
    tf_idfs_test = vectorizer.fit_transform(test_data)
    clf = TruncatedSVD(n_components=num_components)  # PCA-like reduction for sparse matrices
    tf_idfs_train = clf.fit_transform(tf_idfs_train)
    tf_idfs_test = clf.transform(tf_idfs_test)
    # keeps the most relevant features of the word occurrences, producing word vectors of dimension num_components
    return tf_idfs_train, tf_idfs_test
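# Illustrative sketch (not part of the original pipeline): how word_vectors()
# maps titles to dense TF-IDF + TruncatedSVD features on a tiny made-up corpus.
# The helper below is never called; the toy titles and the choice of 2 components
# are assumptions for demonstration only, and actually running it would require
# the NLTK "punkt" tokenizer data to be downloaded.
def _word_vectors_example():
    toy_train = ["love me do", "heartbreak hotel", "purple rain"]
    toy_test = ["love hotel"]
    dense_train, dense_test = word_vectors(toy_train, toy_test, num_components=2)
    # dense_train has shape (3, 2) and dense_test has shape (1, 2): each title is
    # now a 2-dimensional dense vector instead of a sparse bag-of-words row, and
    # the test set only uses vocabulary seen in training.
    print(dense_train.shape, dense_test.shape)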
def setup(file_path, pca_components):  # imports the data and preprocesses it
    data = pd.read_csv(file_path, encoding='utf-8')
    keys = sorted([x for x in set(data["key"]) if isinstance(x, str)])
    modes = sorted([x for x in set(data["mode"]) if isinstance(x, str)])
    genres = sorted([x for x in set(data["music_genre"]) if isinstance(x, str)])
    # get the lists of values for the categorical variables
    data = data.dropna()
    data = data.drop(data[data.tempo == "?"].index)
    data = data.drop(data[data.duration_ms == -1].index)
    # drop bad values
    data['tempo'] = pd.to_numeric(data['tempo'])
    # convert tempo into numbers
    for key in keys:
        if key != key: continue  # ignore NaN
        data[key] = [int(x == key) for x in data["key"]]
        # add a new column: one-hot encoding
    for mode in modes:
        if mode != mode: continue  # ignore NaN
        data[mode] = [int(x == mode) for x in data["mode"]]
        # add a new column: one-hot encoding
    # don't encode genre; since this is a classification problem we can keep the categorical labels as-is
    from sklearn.model_selection import train_test_split
    X, y = data.drop(columns=["obtained_date", "key", "mode", "music_genre"]), data[["music_genre"]]
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    X.iloc[:, 3:] = scaler.fit_transform(X.iloc[:, 3:])
    # skips instance_id, artist_name, and track_name since these aren't numerical
    # normalizes the numerical data with z-scoring
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y)
    # stratify keeps the class distribution the same in the train and test splits
    y_train = np.array(y_train).reshape(-1)
    y_test = np.array(y_test).reshape(-1)
    artists_train = x_train["artist_name"]
    artists_test = x_test["artist_name"]
    titles_train = x_train["track_name"]
    titles_test = x_test["track_name"]
    x_train = x_train.drop(columns=["artist_name", "track_name", "instance_id"])
    x_test = x_test.drop(columns=["artist_name", "track_name", "instance_id"])
    # remove non-numerical columns
    artists_train = np.array(artists_train)
    artists_test = np.array(artists_test)
    titles_train = np.array(titles_train)
    titles_test = np.array(titles_test)
    title_vector_train, title_vector_test = word_vectors(titles_train, titles_test, pca_components)
    x_train = np.concatenate((x_train, title_vector_train), axis=1)
    x_test = np.concatenate((x_test, title_vector_test), axis=1)
    artists_vector_train, artists_vector_test = word_vectors(artists_train, artists_test, pca_components)
    x_train = np.concatenate((x_train, artists_vector_train), axis=1)
    x_test = np.concatenate((x_test, artists_vector_test), axis=1)
    return (x_train, x_test, y_train, y_test, genres)
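# Illustrative sketch (not part of the original pipeline): the manual one-hot
# encoding used in setup() applied to a tiny made-up frame. The helper below is
# never called; the example key values are assumptions for demonstration only.
def _one_hot_example():
    toy = pd.DataFrame({"key": ["A", "C#", "A"]})
    for key in sorted(set(toy["key"])):
        toy[key] = [int(x == key) for x in toy["key"]]
    # toy now has binary columns "A" and "C#" alongside the original "key"
    # column; setup() drops the original categorical column before modeling.
    print(toy)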
def run_model(model, params, x_train, y_train, x_test, y_test, genres, show_plot=False):
    clf = model(**params)
    clf.fit(x_train, y_train)
    y_hat = clf.predict(x_test)
    probs = clf.predict_proba(x_test)  # matrix of per-class probabilities
    print(model)
    model_score = clf.score(x_test, y_test)
    # accuracy score
    fprs, tprs, scores = [], [], []
    # ROC curve data and ROC AUC scores
    for idx, genre in enumerate(genres):  # finds all ROC AUC scores with a one-versus-rest approach
        y_hat_ovr = np.array([y[idx] for y in probs])  # uses probabilities for the ROC curve
        y_test_ovr = np.array([y == genre for y in y_test])
        fpr, tpr, _ = roc_curve(y_test_ovr, y_hat_ovr)
        score = roc_auc_score(y_test_ovr, y_hat_ovr)
        fprs.append(fpr)
        tprs.append(tpr)
        scores.append(score)
    roc_auc = np.mean(scores)  # a weighted average would be slightly more accurate, but all classes have nearly the same number of data points
    print(classification_report(y_test, y_hat))
    print("ROC AUC Scores")
    for genre, score in zip(genres, scores):
        print("%s: %.3f" % (genre, score))
    if show_plot:  # shows a plot of the ROC curves
        for (genre, fpr, tpr, score, color) in zip(genres, fprs, tprs, scores, colors):
            plt.plot(fpr, tpr, color=color, lw=2, label='%s: area = %.3f' % (genre, score))
            # plots the ROC curve for each genre
        plt.plot([0, 1], [0, 1], 'k--', lw=2)
        # plots the dotted chance line through the middle of the ROC chart
        plt.xlim([-0.05, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('%s ROC Curves' % model.__name__)
        plt.legend(loc="lower right")
        plt.savefig("C:\\Users\\abhil\\Documents\\School\\Spring 2023\\Machine Learning\\Capstone Project\\ROC AUC Curves.png",
                    format='png', dpi=300)
        plt.clf()
    return model_score, roc_auc  # accuracy score and ROC AUC score
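# Illustrative sketch (not part of the original pipeline): the one-versus-rest
# ROC AUC computation used in run_model(), on made-up probabilities for three
# classes. The helper below is never called; the toy labels and probabilities
# are assumptions for demonstration only.
def _ovr_roc_auc_example():
    toy_genres = ["Rock", "Jazz", "Rap"]
    toy_y_test = np.array(["Rock", "Jazz", "Rap", "Rock"])
    toy_probs = np.array([[0.7, 0.2, 0.1],
                          [0.2, 0.6, 0.2],
                          [0.1, 0.3, 0.6],
                          [0.5, 0.3, 0.2]])
    for idx, genre in enumerate(toy_genres):
        y_score = toy_probs[:, idx]                  # probability of "this genre"
        y_true = (toy_y_test == genre).astype(int)   # 1 if this genre, else 0
        print(genre, roc_auc_score(y_true, y_score))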
def hyper_tune(model, hypers, params, x_train, y_train, x_test, y_test):
    # finds good hyperparameters; hypers are the hyperparameters to tune, params are parameters held fixed
    # x_test and y_test are accepted for a uniform signature but are not used here
    from sklearn.model_selection import GridSearchCV
    clf = model(**params)
    hyper_param_tuning = GridSearchCV(clf, hypers, verbose=3)
    hyper_param_tuning.fit(x_train, y_train)
    print(hyper_param_tuning.best_params_)
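# Illustrative sketch (not part of the original pipeline): how hyper_tune() might
# be called with a small grid on toy data. The helper below is never called; the
# grid values and the synthetic dataset are assumptions for demonstration only,
# and None is passed for the unused test arguments.
def _hyper_tune_example():
    from sklearn.datasets import make_classification
    from sklearn.ensemble import HistGradientBoostingClassifier
    toy_X, toy_y = make_classification(n_samples=200, n_features=5, random_state=0)
    hyper_tune(HistGradientBoostingClassifier,
               hypers={"learning_rate": [1e-2, 1e-1], "max_leaf_nodes": [15, 31]},
               params={},
               x_train=toy_X, y_train=toy_y, x_test=None, y_test=None)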
def show_scatterplot(x_data, y_data, file_path, plot_title, coloring, labels_):
    from matplotlib.lines import Line2D  # for building custom legend handles
    plt.scatter(x_data, y_data, c=coloring, alpha=0.5)
    plt.xlabel("Embedding 1")
    plt.ylabel("Embedding 2")
    plt.title(plot_title)
    handles_ = [Line2D([0], [0], marker='o', color='w', label=genre_,
                       markerfacecolor=color_, markersize=5) for color_, genre_ in zip(colors, labels_)]
    plt.legend(handles=handles_, labels=labels_, title="Genres", loc='upper center', bbox_to_anchor=(1, 0.5), fancybox=True, shadow=True)
    plt.savefig(file_path, format='png', dpi=300)
    plt.clf()
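# Illustrative sketch (not part of the original pipeline): calling
# show_scatterplot() on a tiny made-up 2-D embedding. The helper below is never
# called; the toy points, labels, and the temp-file output path are assumptions
# for demonstration only.
def _show_scatterplot_example():
    import os
    import tempfile
    toy_points = np.array([[0.0, 1.0], [1.0, 0.5], [-0.5, -1.0]])
    toy_labels = ["Rock", "Jazz", "Rap"]
    toy_coloring = [colors[toy_labels.index(l)] for l in toy_labels]
    out_path = os.path.join(tempfile.gettempdir(), "example_embedding.png")
    show_scatterplot(toy_points[:, 0], toy_points[:, 1], out_path,
                     "Toy 2-D Embedding", toy_coloring, toy_labels)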
def main():
    from sklearn.ensemble import HistGradientBoostingClassifier  # classifier used
    from sklearn.manifold import TSNE  # dimensionality reduction method
    (x_train, x_test, y_train, y_test, genres) = setup("C:\\Users\\abhil\\Documents\\School\\Spring 2023\\Machine Learning\\Capstone Project\\musicData.csv", pca_components=20)
    # gets the preprocessed data
    from sklearn.decomposition import PCA
    clf = PCA(n_components=2)
    clf.fit(x_train)
    pca_embedding = clf.transform(x_test)
    # does dimensionality reduction on the data
    coloring = [colors[genres.index(g)] for g in y_test]
    show_scatterplot(pca_embedding[:, 0], pca_embedding[:, 1], "C:\\Users\\abhil\\Documents\\School\\Spring 2023\\Machine Learning\\Capstone Project\\PCA Embedding.png", "PCA 2-D Embedding", coloring, genres)
    # PCA visualization
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
    clf = LDA(solver='svd', n_components=2)
    clf.fit(x_train, y_train)
    lda_embedding = clf.transform(x_test)
    show_scatterplot(lda_embedding[:, 0], lda_embedding[:, 1], "C:\\Users\\abhil\\Documents\\School\\Spring 2023\\Machine Learning\\Capstone Project\\LDA Embedding.png", "LDA 2-D Embedding", coloring, genres)
    # LDA visualization
    clf = TSNE(n_components=2, perplexity=200, init='pca')
    tsne_embedding = clf.fit_transform(x_test)
    # does dimensionality reduction on the data
    show_scatterplot(tsne_embedding[:, 0], tsne_embedding[:, 1], "C:\\Users\\abhil\\Documents\\School\\Spring 2023\\Machine Learning\\Capstone Project\\TSNE Embedding.png", "TSNE 2-D Embedding", coloring, genres)
    m_score, r_score = run_model(HistGradientBoostingClassifier, {'loss': 'auto', 'learning_rate': 1e-1, 'max_leaf_nodes': 31, 'tol': 1e-7}, x_train, y_train, x_test, y_test, genres, show_plot=True)
    # runs the model, gets the results, and shows the plot
    print(f'Accuracy: {m_score}, ROC_AUC Score: {r_score}')


if __name__ == "__main__":
    main()