-
Notifications
You must be signed in to change notification settings - Fork 49
/
Module-4-Example-10.R
78 lines (51 loc) · 1.85 KB
/
Module-4-Example-10.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# setwd("SET THE Working Director to THE PATH TO THIS DIRECTORY")
library(car)
data <- read.csv("Datasets/CEO_salary.csv")
attach(data)
# Divid the Salary by 1000 to be able to better show the values.
salary1 <- salary/1000
# Now we create a New Data frame out of age, heit and Salary
data1 <- data.frame(age, height, salary1)
# Fit a Multiple Linear Regression model into data.
# Variables are Salary and age
lm(formula = salary1 ~ age + height)
# or better store the resulted model into a variable for further use.
m <- lm(salary1~age+height)
# See documentation of plot.lm
# https://www.rdocumentation.org/packages/stats/versions/3.6.1/topics/plot.lm
par(mfrow = c(2,2))
plot(m, ask=F)
# Good resource for understanding the lm plots
# https://data.library.virginia.edu/diagnostic-plots/
# Here we do some advanced regression diagnostics.
par(mfrow = c(1,1))
#
# https://www.statmethods.net/stats/rdiagnostics.html
# Assessing Outliers
outlierTest(m) # Bonferonni p-value for most extreme obs
qqPlot(m, main="QQ Plot") #qq plot for studentized resid
leveragePlots(m) # leverage plots
# Influential Observations
# Cook's D plot
# identify D values > 4/(n-k-1)
cutoff <- 4/((nrow(data1)-length(m$coefficients)-2))
plot(m, which=4, cook.levels=cutoff)
# Influence Plot
influencePlot(m, id.method="identify", main="Influence Plot",
sub="Circle size is proportial to Cook's Distance" )
# Normality of Residuals
# qq plot for studentized resid
qqPlot(m, main="QQ Plot")
# distribution of studentized residuals
library(MASS)
sresid <- studres(m)
hist(sresid, freq=FALSE,
main="Distribution of Studentized Residuals")
xfit<-seq(min(sresid),max(sresid),length=40)
yfit<-dnorm(xfit)
lines(xfit, yfit)
# Evaluate homoscedasticity
# non-constant error variance test
ncvTest(fit)
# plot studentized residuals vs. fitted values
spreadLevelPlot(fit)