forked from ovh/summit2016-RankingPredict
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstep8_xgboost.R
100 lines (76 loc) · 2.61 KB
/
step8_xgboost.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
library(dplyr)
library(xgboost)
library(readr)
library(stringr)
library(caret)
library(car)
library(clusterSim)
library(Ckmeans.1d.dp)
library(pROC)
library(SDMTools)
library(openssl)
library(httr)
set.seed(123)

# Load the cleaned dataset. read.csv2 expects ';' as the field separator and
# ',' as the decimal mark.
datasetXg <- read.csv2(
  "./dataset/dataset-cleaned.csv",
  stringsAsFactors = FALSE,
  header = TRUE
)

# cleansing: keep only rows whose two Visiblis scores are strictly positive
datasetXg <- filter(datasetXg, Visiblis_Page > 0 & Visiblis_Title > 0)

#datasetXg$Https <- as.list(download_ssl_cert(datasetXg$URL, 443))$subject

# Rows whose URL contains "https"; computed once and reused for both flags.
indexHttps <- which(grepl("https", datasetXg$URL))

# Https flag: 1 for the rows found above, 0 everywhere else.
datasetXg$Https <- 0
datasetXg$Https[indexHttps] <- 1

# EV flag: defaults to 0 and is only looked up for HTTPS rows.
datasetXg$EV <- 0
source("./step9_getInfoSSL.R")

# Iterate over the HTTPS row indices themselves (not 1..length(indexHttps)),
# so the host is read from the matching row and the result is stored in that
# row only. Iterating the vector directly also makes the loop a no-op when
# there are no HTTPS rows.
# NOTE(review): detectSSL() is defined in step9_getInfoSSL.R; this assumes it
# returns a single value per host -- confirm.
for (i in indexHttps) {
  host <- parse_url(datasetXg$URL[i])$hostname
  print(host)
  datasetXg$EV[i] <- detectSSL(host)
}
# Build the feature table from the cleaned dataset.
# Candidate features that are currently switched off stay listed as comments.
datasetMat <- datasetXg[, c(
  # "Text.Ratio",
  "Word.Count",
  "Response.Time",
  "TrustFlow",
  "CitationFlow",
  "Visiblis_Page",
  "Visiblis_Title",
  # "Inlinks",
  # "Outlinks",
  # "Title.1.Length",
  # "H1.1.length",
  "RefDomains",
  "ExtBackLinks",
  "Https",
  "EV"
)]

# Force both Visiblis scores to numeric; values that cannot be parsed become
# NA and are zeroed together with every other missing value just below.
datasetMat[c("Visiblis_Page", "Visiblis_Title")] <- lapply(
  datasetMat[c("Visiblis_Page", "Visiblis_Title")],
  as.numeric
)
datasetMat[is.na(datasetMat)] <- 0

# Column-wise normalization with clusterSim's "n1" scheme.
# NOTE(review): if "n1" divides by the column standard deviation, a constant
# column (e.g. an all-zero EV flag) would turn into NaN -- confirm.
datasetMat <- data.Normalization(
  datasetMat,
  type = "n1",
  normalization = "column"
)
## 75% of the sample size
smp_size <- floor(0.75 * nrow(datasetMat))
# Both partitions must be non-empty, otherwise training or evaluation below
# would silently run on zero rows.
stopifnot(smp_size >= 1, smp_size < nrow(datasetMat))
train_ind <- sample(seq_len(nrow(datasetMat)), size = smp_size)

# Random train / test partition; labels are taken from the same row positions
# of the un-normalized dataset.
X <- datasetMat[train_ind, ]
X_test <- datasetMat[-train_ind, ]
y <- datasetXg[train_ind, "label"]
y_test <- datasetXg[-train_ind, "label"]

# NOTE(review): the first feature column is excluded from the model, exactly
# as in the original code -- confirm this is intended.
# The matrices are built once so that training, prediction and the importance
# table all see the same columns.
train_matrix <- data.matrix(X[, -1])
test_matrix <- data.matrix(X_test[, -1])

# Gradient-boosted trees with a logistic objective (predictions are
# probabilities). `nrounds` is spelled out instead of relying on partial
# argument matching of `nround`.
model <- xgboost(
  data = train_matrix,
  label = y,
  eta = 0.1,
  max_depth = 10,
  verbose = 1,
  nrounds = 1000,
  objective = "binary:logistic",
  nthread = 8
)

y_pred <- predict(model, test_matrix)
#y_pred <- matrix(y_pred, nrow=length(y_test), ncol=2, byrow=T)
#sum(y_test)
#sum(abs(y_pred - y_test))
#sum(abs( y_pred-0.5>0 -y_test))

# ROC curve of the predicted probabilities against the held-out labels.
p1 <- plot(roc(y_test, y_pred))

# Feature importance. The names come from the matrix the model was trained on
# (names(X) still contains the excluded first column and would misalign the
# labels). Arguments are named so the call does not depend on their position.
importance_matrix <- xgb.importance(
  feature_names = colnames(train_matrix),
  model = model
)
p2 <- xgb.plot.importance(importance_matrix)

# Confusion matrix at a 0.9 probability cut-off. caret expects the predictions
# as `data` and the ground truth as `reference`, both as factors sharing the
# same levels; fixing the levels keeps the table 2x2 even when one class is
# never predicted.
# NOTE(review): assumes `label` is coded 0/1 -- confirm.
class_levels <- c(0, 1)
conf <- confusionMatrix(
  data = factor(as.integer(y_pred > 0.9), levels = class_levels),
  reference = factor(y_test, levels = class_levels)
)
print(conf)