-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlinear_regression_share.R
164 lines (84 loc) · 3.92 KB
/
linear_regression_share.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#--------Import the libraries--------------
# for impute function to replace NA
library(Hmisc)
# for scatter plot and correlation
library(GGally)
#for vif( variable inflation factor)
library(car)
# ------- read the csv data -----------------
mashable_data<-read.csv("OnlineNewsPopularity.csv", stringsAsFactors = FALSE)
#First 6 rows of the data
head(mashable_data)
#structure of the data to see sample data and datatype
str(mashable_data)
#summary of data (check for NA Value for Missing value treatment)
summary(mashable_data)
#-----------MIssing Value treatment-------------
# column n_tokens_content has NA values
summary(mashable_data$n_tokens_content)
# replace NA with mean impute function from hmisc library
mashable_data$n_tokens_content = impute(mashable_data$n_tokens_content, mean)
summary(mashable_data$n_tokens_content)
#------------Outliers treatment (Do for all variable)-----------
#look for outliers
#box plot sets the outlier statistics coef is 1.5 time IQR from Q3
# value above ((Q3 - Q1) * 1.5) + Q3 is outlier
outlier_values = boxplot.stats(mashable_data$num_hrefs)$out
# Hmisc puts a * on missing value treatment, to solve that use as.numeric
boxplot(as.numeric(mashable_data$num_hrefs), main="No of token")
head(outlier_values)
# take a new variable to manipulate
x <- mashable_data$n_tokens_content
# find Q1 = qnt[1] and Q3 = qnt[2]
qnt <- quantile(x, probs=c(.25, .75), na.rm = T)
# find 5 percentile and 95 percentile caps[1] and caps[2]
caps <- quantile(x, probs=c(.05, .95), na.rm = T)
#1.5 times of range
H <- 1.5 * IQR(x, na.rm = T)
#value below 1.5 times range from Q1 replaced by 5 percentile value
x[x < (qnt[1] - H)] <- caps[1]
#value aboe 1.5 times range from Q3 replaced by 95 percentile value
x[x > (qnt[2] + H)] <- caps[2]
# replacing the existing column
mashable_data$n_tokens_content<-x
#---------------- check correlation----------------
#ggpairs(data=mashable_data, columns=3:5, title="mashable data")
# creating a matrix of correlation
correlation_matrix = cor(mashable_data[,-1]) #removal of 1st categorical variable as correlation is applicable for numeric values
correlation_matrix
write.csv( correlation_matrix,'correlation.csv')
#---------------- multicollinearity -----------------
# first we will need to run the linear model
car_model<-lm(shares~ ., data=mashable_data[,-1])
#2 variables with very high multicollinearity. n_unique_tokens n_non_stop_unique_tokens
vif(car_model)
# removal of variables that have high multicollinearity
names(mashable_data)
car_model<-lm(shares~., data=mashable_data[,-c(1,5)])
vif(car_model)
#-------------- test and learn data ---------------
smp_size = 0.75 * nrow(mashable_data)
set.seed(123)
train_ind <- sample(seq_len(nrow(mashable_data)), size = smp_size)
train <- mashable_data[train_ind, ]
test <- mashable_data[-train_ind, ]
#--------------model creation----------
linearMod <- lm(shares ~ ., data=train[,-c(1,5)]) # build linear regression model on full data
# Check the model output
summary(linearMod)
names(mashable_data)
#---------Remove variable with low p value one by one VVIP ------------------
linearMod <- lm(shares ~ ., data=train[,-c(1,2,3,4,5,7,10,11,12,14,15,16,19,20,21, 22)])
# Check the model output
summary(linearMod)
# ------------predict target variable/ scoring ---------------------
# predict shares
shares_pred <- predict(linearMod, test)
head(shares_pred)
# ---------------- Evaluation/ Validation ----------------------------
actuals_preds <- data.frame(cbind(actuals=train$shares, predicteds=shares_pred)) # make actuals_predicteds dataframe.
correlation_accuracy <- cor(actuals_preds)
head(actuals_preds)
#Evaluation predictions
mape <- mean((abs((actuals_preds$predicteds - actuals_preds$actuals))/actuals_preds$actuals),na.rm = TRUE)
mape