-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCars_Script.R
132 lines (108 loc) · 4.45 KB
/
Cars_Script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# script for cars data analysis code
# Load data
library(ggplot2)
library(corrplot)
library(leaps)
data(mtcars)
head(mtcars)
# Light preprocessing
# get correlation matrix before turning everything into factors
car_corrs <- cor(mtcars)
# mat : is a matrix of data
# ... : further arguments to pass to the native R cor.test function
cor.mtest <- function(mat, ...) {
mat <- as.matrix(mat)
n <- ncol(mat)
p.mat<- matrix(NA, n, n)
diag(p.mat) <- 0
for (i in 1:(n - 1)) {
for (j in (i + 1):n) {
tmp <- cor.test(mat[, i], mat[, j], ...)
p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
}
}
colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
p.mat
}
# matrix of the p-value of the correlation
p.mat <- cor.mtest(mtcars)
# create factors
mtcars_fac <- mtcars
mtcars_fac$cyl <- as.factor(mtcars_fac$cyl)
mtcars_fac$vs <- as.factor(mtcars_fac$vs)
mtcars_fac$am <- factor(mtcars_fac$am)
levels(mtcars_fac$am) <- c("auto", "man")
#mtcars_fac$trans <- as.factor(ifelse(mtcars_fac$am == "auto", 1, 0)) # auto=1, manual = 0
mtcars_fac$gear <- factor(mtcars_fac$gear)
mtcars_fac$carb <- factor(mtcars_fac$carb)
# Basic exploratory data analysis
# Plot MPG vs trans
g <- ggplot(mtcars_fac, aes(x=am, y=mpg, fill=am)) +
theme(legend.position="none"
, panel.background = element_rect(fill='grey')
, plot.background = element_rect(fill='darkseagreen')
, plot.title = element_text(hjust = 0.5)
) +
ggtitle('Mileage By Transmission Type') +
labs(x="Transmission Type", y="MPG") +
geom_violin(trim=TRUE) +
scale_fill_brewer(palette="Blues") +
geom_boxplot(width=0.05) +
geom_dotplot(binaxis = 'y', stackdir = 'center', dotsize = .5)
print(g)
# Plot MPG vs (trans x cyls)
g1 <- ggplot(mtcars_fac, aes(x=am, y=mpg, fill=am)) +
theme(legend.position="none"
, panel.background = element_rect(fill='grey')
, plot.background = element_rect(fill='darkseagreen')
, plot.title = element_text(hjust = 0.5)
) +
ggtitle('Mileage By Transmission Type and # Of Cylinders') +
labs(x="Transmission Type", y="MPG") +
facet_wrap(~cyl, nrow=1) +
geom_violin(trim=TRUE) +
scale_fill_brewer(palette="Blues") +
geom_boxplot(width=0.05) +
geom_dotplot(binaxis = 'y', stackdir = 'center', dotsize = .5)
print(g1)
# look at variable correlations to decide which variables to throw away
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(car_corrs, method="color", col=col(200),
type="upper", order="hclust",
addCoef.col = "black", # Add coefficient of correlation
tl.col="black", tl.srt=45, #Text label color and rotation
# Combine with significance
p.mat = p.mat, sig.level = 0.01, insig = "blank",
# hide correlation coefficient on the principal diagonal
diag=FALSE
)
#corrplot(car_corrs, method="number")
# all subsets method of variable selection
best.subset <- regsubsets(mpg ~ ., mtcars, nvmax=10)
best.subset.summary <- summary(best.subset)
best.subset.summary$outmat
# results
# cyl disp hp drat wt qsec vs am gear carb
#1 ( 1 ) " " " " " " " " "*" " " " " " " " " " "
#2 ( 1 ) "*" " " " " " " "*" " " " " " " " " " "
#3 ( 1 ) " " " " " " " " "*" "*" " " "*" " " " "
#4 ( 1 ) " " " " "*" " " "*" "*" " " "*" " " " "
#5 ( 1 ) " " "*" "*" " " "*" "*" " " "*" " " " "
# so tranny is only the 3rd important variable - weight being #1 and cyl/qsec (correlated) being #2
# so lets explore the smallest model with tranny type
best.subset.by.adjr2 <- which.max(best.subset.summary$adjr2)
best.subset.by.adjr2
best.subset.by.cp <- which.min(best.subset.summary$cp)
best.subset.by.cp
best.subset.by.bic <- which.min(best.subset.summary$bic)
best.subset.by.bic
par(mfrow = c(2, 2), oma = c( 0, 0, 2, 0 ))
plot(best.subset$rss, xlab="Number of Variables", ylab="RSS", type="l")
plot(best.subset.summary$adjr2, xlab="Number of Variables", ylab="Adjusted RSq", type="l")
points(best.subset.by.adjr2, best.subset.summary$adjr2[best.subset.by.adjr2], col="red", cex =2, pch =20)
plot(best.subset.summary$cp, xlab="Number of Variables", ylab="CP", type="l")
points(best.subset.by.cp, best.subset.summary$cp[best.subset.by.cp], col="red", cex =2, pch =20)
plot(best.subset.summary$bic, xlab="Number of Variables", ylab="BIC", type="l")
points(best.subset.by.bic, best.subset.summary$bic[best.subset.by.bic], col="red", cex =2, pch =20)
title( "Regression Variable Selection KPIs", outer = TRUE )
#coef(best.subset, 3)