-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathEdx Data Science Capstone project.R
552 lines (430 loc) · 21.6 KB
/
Edx Data Science Capstone project.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
# Edx Data Science Capstone project
#############################################################
# Create edx set, validation set, and submission file
#############################################################
# Note: this process could take a couple of minutes
library(tidyverse)
library(caret)
library(lubridate)
# MovieLens 10M dataset:
# https://grouplens.org/datasets/movielens/10m/
# http://files.grouplens.org/datasets/movielens/ml-10m.zip
dl <- tempfile()
download.file("http://files.grouplens.org/datasets/movielens/ml-10m.zip", dl)
ratings <- read.table(text = gsub("::", "\t", readLines(unzip(dl, "ml-10M100K/ratings.dat"))),
col.names = c("userId", "movieId", "rating", "timestamp"))
movies <- str_split_fixed(readLines(unzip(dl, "ml-10M100K/movies.dat")), "\\::", 3)
colnames(movies) <- c("movieId", "title", "genres")
movies <- as.data.frame(movies) %>% mutate(movieId = as.numeric(levels(movieId))[movieId],
title = as.character(title),
genres = as.character(genres))
movielens <- left_join(ratings, movies, by = "movieId")
# Validation set will be 10% of MovieLens data
set.seed(1)
test_index <- createDataPartition(y = movielens$rating, times = 1, p = 0.1, list = FALSE)
edx <- movielens[-test_index,]
temp <- movielens[test_index,]
# Make sure userId and movieId in validation set are also in edx set
validation <- temp %>%
semi_join(edx, by = "movieId") %>%
semi_join(edx, by = "userId")
# Add rows removed from validation set back into edx set
removed <- anti_join(temp, validation)
edx <- rbind(edx, removed)
rm(dl, ratings, movies, test_index, temp, movielens, removed)
## Save a copy to be loaded so don't need to run above code
save(edx, file = "C:/Users/jerem/Documents/GitHub/EdxCapstone/edx.Rda")
save(validation, file = "C:/Users/jerem/Documents/GitHub/EdxCapstone/validation.Rda")
#Load in the Edx and Validation datasets
load("C:/Users/jerem/Documents/GitHub/EdxCapstone/edx.Rda")
load("C:/Users/jerem/Documents/GitHub/EdxCapstone/validation.Rda")
#Create a table that has the number of movies with certain ratings
ratings <- edx %>% group_by(rating) %>% tally() %>% mutate(Percent = round(((n/sum(n)) * 100),1) ) %>% select(rating, Percent)
#Plot of ratings
edx %>% ggplot(aes(x = rating)) +
geom_histogram(binwidth = 0.5) + ggtitle("Number of Ratings") +
ylab("Number of Ratings") + xlab("Rating") + scale_y_continuous(expand = c(0,0)) +
scale_x_continuous(expand = c(0,0), limits = c(0,5))
#How many different movies? - 10,677
nrow(distinct(edx, movieId))
#Plot the number of ratings per movie
edx %>% group_by(movieId) %>% tally() %>% ggplot(aes(x = n)) +
geom_histogram(binwidth = 10) + ggtitle("Number of Ratings Per Movie") +
ylab("Number of Movies") + xlab("Number of Ratings") + scale_y_continuous(expand = c(0,0)) +
scale_x_continuous(expand = c(0,0), limits = c(1,750))
#How many unique users? - 69,787
nrow(distinct(edx, userId))
#Plot the number of ratings per user
edx %>% group_by(userId) %>% tally() %>% ggplot(aes(x = n)) +
geom_histogram(binwidth = 10) + ggtitle("Number of Ratings Per User") +
ylab("Number of Users") + xlab("Number of Ratings") + scale_y_continuous(expand = c(0,0)) +
scale_x_continuous(expand = c(0,0), limits = c(0,2000))
#Need to find the number for different genres. Genres are sometimes combined
# so use str_detect to find the genres within longer strings
table(str_detect(edx$genres, "Drama"))
table(str_detect(edx$genres, "Comedy"))
table(str_detect(edx$genres, "Thriller"))
table(str_detect(edx$genres, "Romance"))
# Which movie has the greatest number of ratings?
# Using tally look at counts and then the top ten with the most ratings
edx %>% group_by(title) %>% tally %>% top_n(10, n) %>% arrange(desc(n))
### Instructions:: You will use the following code to generate your datasets.
# Develop your algorithm using the edx set. For a final test of your algorithm,
# predict movie ratings in the validation set as if they were unknown.
# RMSE will be used to evaluate how close your predictions are to the true values in the validation set.
#So train the model on the EdX set then test on validation
### Cleaning
#Check for missing values - none found
sum( is.na(edx$rating) )
sum( is.na(edx$genres) )
sum( is.na(edx$date) )
#The movie title won't be used in the model, remove that and
edx <- edx %>% select(-title)
#See if there is a relationship between timestamp and number of ratings
edx <- mutate(edx, date = as_datetime(timestamp))
edx %>% mutate(date = round_date(date, unit = "week")) %>%
group_by(date) %>%
summarize(rating = mean(rating)) %>%
ggplot(aes(date, rating)) +
geom_point() +
geom_smooth(span = 0.15)
#No relationship so drop timestamp
edx <- edx %>% select(-timestamp, -date)
#### Create new variabless
#Examine impact of genres on rating
edx %>% group_by(genres) %>%
summarize(n = n(), avg = mean(rating), se = sd(rating)/sqrt(n())) %>%
filter(n >= 40000) %>%
mutate(genres = reorder(genres, avg)) %>%
ggplot(aes(x = genres, y = avg, ymin = avg - 2*se, ymax = avg + 2*se)) +
geom_point() +
geom_errorbar() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
##Convert the genres variables to dummies
# What are the individual genres, not combined, create table to find them
#Filter this table so only genres without the | in them are included
genres <- edx %>% mutate(nc = nchar(genres)) %>% group_by(genres,nc) %>%
tally %>% arrange(nc) %>% mutate(s = str_detect(genres, "[|]")) %>% filter(s == FALSE)
#Look at the unique individual genres
genres$genres
edx$Drama <- str_detect(edx$genres,"Drama")
edx$War <- str_detect(edx$genres,"War")
edx$IMAX <- str_detect(edx$genres,"IMAX")
edx$Crime <- str_detect(edx$genres,"Crime")
edx$Action <- str_detect(edx$genres,"Action")
edx$Comedy <- str_detect(edx$genres,"Comedy")
edx$Horror <- str_detect(edx$genres,"Horror")
edx$SciFi <- str_detect(edx$genres,"Sci-Fi")
edx$Fantasy <- str_detect(edx$genres,"Fantasy")
edx$Musical <- str_detect(edx$genres,"Musical")
edx$Mystery <- str_detect(edx$genres,"Mystery")
edx$Romance <- str_detect(edx$genres,"Romance")
edx$Western <- str_detect(edx$genres,"Western")
edx$Children <- str_detect(edx$genres,"Children")
edx$Thriller <- str_detect(edx$genres,"Thriller")
edx$Adventure <- str_detect(edx$genres,"Adventure")
edx$Animation <- str_detect(edx$genres,"Animation")
edx$FilmNoir <- str_detect(edx$genres,"Film-Noir")
edx$Documentary <- str_detect(edx$genres,"Documentary")
#Delete the original genres variable
edx <- edx %>% select(-genres)
### Create regularized user and movie means
## First create graphics of the distribution of average ratings for movies and users
edx %>% group_by(movieId) %>% summarize(mu_m = mean(rating)) %>% ggplot(aes(x = mu_m)) +
geom_histogram(binwidth = 0.1) + ggtitle("Average Ratings by Movie") +
scale_y_continuous(expand = c(0,0)) + ylab("Number of Movies") + xlab("Average Movie Rating")
edx %>% group_by(userId) %>% summarize(mu_u = mean(rating)) %>% ggplot(aes(x = mu_u)) +
geom_histogram(binwidth = 0.1) + ggtitle("Average Ratings by User") +
scale_y_continuous(expand = c(0,0)) + ylab("Number of Users") + xlab("Average User Rating")
#Create an RMSE function
RMSE <- function(true_ratings, predicted_ratings){ sqrt(mean((true_ratings - predicted_ratings)^2 )) }
#Penalized Least Squares
#Lambda is a tuning parameter, determine the best one
#Create a sequence of lambdas
lambdas <- seq(0, 0.75, 0.05)
#Apply this function which calculates the movie and then user effects using the different lambdas
rmses <- sapply(lambdas, function(l){
mu <- mean(edx$rating)
#Create the movie effect by calculating the mean rating per movie but subtracting the overall mean
b_i <- edx %>%
group_by(movieId) %>%
summarize(b_i = sum(rating - mu)/(n()+l))
#Create the user effect by calculating the mean rating for user but subtracting the overall mean
# and the movie effect
b_u <- edx %>%
left_join(b_i, by="movieId") %>%
group_by(userId) %>%
summarize(b_u = sum(rating - b_i - mu)/(n()+l))
#Create predicted ratings using the movie and user effects and the overall mean
predicted_ratings <-
edx %>%
left_join(b_i, by = "movieId") %>%
left_join(b_u, by = "userId") %>%
mutate(pred = mu + b_i + b_u) %>%
pull(pred)
#Calculate the RMSE between the prediced ratings and the true rating and store that in a vector
return(RMSE(edx$rating, predicted_ratings))
})
#Plot the RMSE using different lambdas
qplot(lambdas, rmses)
#Return the lambda with the lowest RMSE
lambdas[which.min(rmses)]
## Lowest RMSE is when lamda is 0.5, so use that to create the movie and user effect variables
l <- lambdas[which.min(rmses)]
l
#Create a measure of the overall mean
mu <- mean(edx$rating)
#Create the movie effect by calculating the mean rating per movie but subtracting the overall mean
b_i <- edx %>% group_by(movieId) %>%summarize(b_i = sum(rating - mu)/(n()+l))
#Create the user effect by calculating the mean rating for user but subtracting the overall mean
# and the movie effect
b_u <- edx %>% left_join(b_i, by="movieId") %>% group_by(userId) %>%
summarize(b_u = sum(rating - b_i - mu)/(n()+l))
#Combine the movie and user effects back into the overall dataset but rename it so old version kept
train <- edx %>% left_join(b_i, by = "movieId") %>% left_join(b_u, by = "userId")
#Remove these dataframes as they are no longer needed
rm(b_i, b_u)
# Check for near zero variance.
nzv <- nearZeroVar(train)
nzv
##: Columns with zero variance:6,13,16,21,22
#IMAX, Musical, Western, FilmNoir, Documentary
# Remove the columns with zero variance, these are all genre dummies
train <- train[ -c(6,13,16,21,22) ]
#Save a version of this train dataset
save(train, file = "C:/Users/jerem/Documents/GitHub/EdxCapstone/train.Rda")
#Create a smaller version of the train ds to speed up the models
set.seed(1996)
#Sample 100,000 rows
train_sub <- train[sample(nrow(train), 100000),]
#Use that to create an index of the sampled rows, just 20% of the sample
test_index <- createDataPartition(y = train_sub$rating, times = 1, p = 0.2, list = FALSE)
#Create a train dataframe with 80% of the rows
train_set <- train_sub[-test_index,]
#Create a test dataset with the other 20%
test_set <- train_sub[test_index,]
#Do a semi_join to ensure that all movies and users in test set are in training set
test_set <- test_set %>% semi_join(train_set, by = "movieId") %>% semi_join(train_set, by = "userId")
#Remove the ID columns
test_set <- test_set %>% select(-userId, -movieId)
train_set <- train_set %>% select(-userId, -movieId)
## Try a linear regression
#First fit the model
fit <- lm(rating ~ ., data = train_set)
#Use the fitted model to predict ratings values in test_set
y_hat <- predict(fit, newdata = test_set)
#Now see the root mean square error: 0.8486
rmse_lm <- RMSE(test_set$rating, y_hat)
#Doublecheck the RMSE value
sqrt( mean((y_hat - test_set$rating)^2 ) )
#Create a table of RMSEs for the different models
rmse_results <- data_frame(method = "Linear Regression", RMSE = rmse_lm)
#################################################################################
### Try a Penalized Ridge Regression Model
#Create movie and user means for ridge regression, not regularized
b_i <- edx %>% group_by(movieId) %>% summarize(b_i = mean(rating) )
b_u <- edx %>% group_by(userId) %>% summarize(b_u = mean(rating) )
edx_r <- edx %>% left_join(b_i, by = "movieId") %>% left_join(b_u, by = "userId")
#Create a smaller version of ridge regression datasets
set.seed(2001)
train_sub2 <- edx_r[sample(nrow(train), 200000),]
train_set2 <- train_sub2[-test_index,]
test_set2 <- train_sub2[test_index,]
#Do a semi_join to ensure that all movies in test set are in training set
test_set2 <- test_set2 %>% semi_join(train_set, by = "movieId") %>%
semi_join(train_set, by = "userId")
#Remove the ID columns
test_set2 <- test_set2 %>% select(-userId, -movieId)
train_set2 <- train_set2 %>% select(-userId, -movieId)
#Create a vector of the ratings for the ridge regression
edx_y <- train_set2$rating
testedx_y <- test_set2$rating
#Remove the ratings from the dataframe with just predictors
train_set2 <- train_set2 %>% select(-rating)
test_set2 <- test_set2 %>% select(-rating)
#Tune the lamda for the ridge regression
ridgeGrid <- data.frame(.lambda = seq(0, 0.7, length = 10) )
#Set the number of cross validation folds to 10
ctrl <- trainControl(method = "cv", number = 10)
ridgeRegFit <- train(train_set2, edx_y,
method = "ridge",
tuneGrid = ridgeGrid,
trControl = ctrl,
#Scale the predictors
preProc = c("center","scale"))
#Lowest RMSE is when Lambda is 0.0
ridgeRegFit
train_ridge <- train(train_set2, edx_y,
method = "ridge",
lamda = 0.0,
trControl = ctrl,
#Scale the predictors
preProc = c("center","scale"))
## Now create the predicted Ys for the ridge regression
y_hat_ridge <- predict(train_ridge, test_set2, type = "raw")
#RMSE for regression trees 0.7526076
rmse_r <- RMSE(testedx_y,y_hat_ridge)
#Add this to the table of RMSE
rmse_results <- bind_rows(rmse_results,
data_frame(method = "Penalized Ridge Regression", RMSE = rmse_r ))
#################################################################################
### Regression trees
# Try with different levels of CP in the tuning grid
train_rpart <- train(rating ~ .,
method = "rpart",
tuneGrid = data.frame(cp = seq(0, 0.01, len = 10)),
data = train_set)
#Plot the tuning parameters
ggplot(train_rpart)
#Try smaller calues of cp since those have lower RMSE
train_rpart <- train(rating ~ .,
method = "rpart",
tuneGrid = data.frame(cp = seq(0, 0.001, len = 10)),
data = train_set)
ggplot(train_rpart)
train_rpart <- train(rating ~ .,
method = "rpart",
tuneGrid = data.frame(cp = seq(0, 0.0005, len = 20)),
data = train_set)
ggplot(train_rpart)
#Plot the regression trees
plot(train_rpart$finalModel, margin = 0.1)
text(train_rpart$finalModel, cex = 0.75)
#The plot is not very useful
train_rpart$finalModel
## cp at 0.00024 produces the lowest error
train_rpart <- train(rating ~ ., method = "rpart", cp = 0.00024, data = train_set)
## Now create the predicted Ys for the regression trees
y_hat_rpart <- predict(train_rpart, test_set, type = "raw")
#RMSE for regression trees
rmse_rt <- RMSE(test_set$rating, y_hat_rpart)
rmse_results <- bind_rows(rmse_results,
data_frame(method = "Regression Trees Model", RMSE = rmse_rt ))
#################################################################################
### Try a Random Forest Model
#Tune the number of nodes
train_rf_2 <- train(rating ~ ., method = "Rborist",
tuneGrid = data.frame(predFixed = 2, minNode = seq(1, 10, 1)),
data = train_set)
##The RMSE was high, lowest was 0.9164772 for minNode = 4
train_rf_2
##minNode RMSE Rsquared MAE
#1 0.9192283 0.2987140 0.7212660
#2 0.9183750 0.2993035 0.7204278
#3 0.9189227 0.2993747 0.7209477
#4 0.9182957 0.3001221 0.7204291
#5 0.9187032 0.2999568 0.7207166
#6 0.9187269 0.3005963 0.7209528
#7 0.9186815 0.3006666 0.7207485
#8 0.9186136 0.3008872 0.7206874
#9 0.9187092 0.3015820 0.7209318
#10 0.9184179 0.3019828 0.7209016
#Try to tune the number of trees
#mtry: Number of variables randomly sampled as candidates at each split, equivalent to predFixed
#Good rule of thumb for mtry is the number of predictors divided by 3, or 5 in this case.
#Keep the number of cross validations low to reduce run time
#verboseIter should track how long it takes
control <- trainControl(method = "cv", number = 2, p = .9)
train_rf <- train(train_set[,2:17], train_set$rating,
method = "rf",
tuneGrid = data.frame(mtry = c(3,5) ),
trControl = control,
verboseIter = TRUE)
train_rf
#No pre-processing
#Resampling: Cross-Validated (2 fold)
#Summary of sample sizes: 39998, 40001
#Resampling results across tuning parameters:
# mtry RMSE Rsquared MAE
#3 0.8695489 0.3370193 0.6763025
#5 0.8750665 0.3228978 0.6787266
#RMSE was used to select the optimal model using the smallest value.
#The final value used for the model was mtry = 3.
## Now create the predicted Ys for the random forest model
y_hat_rf <- predict(train_rf, test_set)
#RMSE for random forest where mtry = 3:
rmse_rf_mtry3 <- RMSE(test_set$rating,y_hat_rf)
rmse_results <- bind_rows(rmse_results,
data_frame(method = "Random Forest", RMSE = rmse_rf_mtry3 ))
#################################################################################
### Try KNN function
#This is for cross validation, number is the number of folds
control <- trainControl(method = "cv", number = 2, p = .9)
#First try this large range of Ks
train_knnh <- train(train_set[,2:17],train_set$rating,
method = "knn",
tuneGrid = data.frame(k = seq(20, 50, 10)),
trControl = control)
train_knnh
#No pre-processing
#Resampling: Cross-Validated (2 fold)
#Summary of sample sizes: 39998, 40001
#Resampling results across tuning parameters:
# k RMSE Rsquared MAE
#20 0.8879322 0.3019723 0.6896326
#30 0.8843840 0.3079554 0.6864500
#40 0.8841499 0.3095794 0.6865460
#50 0.8850363 0.3094689 0.6874967
#RMSE was used to select the optimal model using the smallest value.
#The final value used for the model was k = 40.
#Try k = 40, since it would be hard to optimize further without taking a lot of time
y_hat_knn <- predict(train_knnh, test_set[,2:17])
#Look at RMSE
rmse_knn <- RMSE(test_set$rating, y_hat_knn)
#Add the KNN rmse to the table
rmse_results <- bind_rows(rmse_results,
data_frame(method = "KNN", RMSE = rmse_knn ))
## Save the rmse_results table
save(rmse_results, file = "C:/Users/jerem/Documents/GitHub/EdxCapstone/RMSE_results.Rda")
##########################################################################################
## Since the Linear Regression Model had the lowest RMSE run that on the full dataset
# to get the final RMSE
### Examine RMSE for Full EdX and Validation datasets for linear regression
#First need to process the validation dataset so it has the same predictors as EdX
validation$Drama <- str_detect(validation$genres,"Drama")
validation$War <- str_detect(validation$genres,"War")
validation$IMAX <- str_detect(validation$genres,"IMAX")
validation$Crime <- str_detect(validation$genres,"Crime")
validation$Action <- str_detect(validation$genres,"Action")
validation$Comedy <- str_detect(validation$genres,"Comedy")
validation$Horror <- str_detect(validation$genres,"Horror")
validation$SciFi <- str_detect(validation$genres,"Sci-Fi")
validation$Fantasy <- str_detect(validation$genres,"Fantasy")
validation$Musical <- str_detect(validation$genres,"Musical")
validation$Mystery <- str_detect(validation$genres,"Mystery")
validation$Romance <- str_detect(validation$genres,"Romance")
validation$Western <- str_detect(validation$genres,"Western")
validation$Children <- str_detect(validation$genres,"Children")
validation$Thriller <- str_detect(validation$genres,"Thriller")
validation$Adventure <- str_detect(validation$genres,"Adventure")
validation$Animation <- str_detect(validation$genres,"Animation")
validation$FilmNoir <- str_detect(validation$genres,"Film-Noir")
validation$Documentary <- str_detect(validation$genres,"Documentary")
#Now delete the original genres variable
val <- validation %>% select(-genres)
#Regularize and scale the movie and user means as done on the training dataset above
#Use the lamda found above
l <- 0.5
mu <- mean(val$rating)
#Movie effect
b_i <- val %>% group_by(movieId) %>%summarize(b_i = sum(rating - mu)/(n()+l))
#User effect
b_u <- val %>% left_join(b_i, by="movieId") %>% group_by(userId) %>%
summarize(b_u = sum(rating - b_i - mu)/(n()+l))
#Add the movie and user effects to the validation dataset
val <- val %>% left_join(b_i, by = "movieId") %>% left_join(b_u, by = "userId")
#Drop the variables not used in the final model
val <- val %>% select(-timestamp, -title)
# Remove the columns with zero variance and the ID columns
##: Columns with zero variance:6,13,16,21,22
#IMAX, Musical, Western, FilmNoir, Documentary
val <- val[ -c(6,13,16,21,22) ]
val <- val %>% select(-userId, -movieId)
edx2 <- train %>% select(-userId, -movieId)
## Try a linear regression on full EdX dataset
fit2 <- lm(rating ~ ., data = edx2)
## Use the betas to predict ratings in the validation dataset
y_hat_final <- predict(fit2, newdata = val)
#Now see the root mean square error: 0.6816817
final_rmse <- RMSE(val$rating, y_hat_final)
print(final_rmse)