xgboost_example.R
#
# replicate xgboost results from
# https://towardsdatascience.com/detect-parkinsons-with-10-lines-of-code-intro-to-xgboost-51a4bf76b2e6
#
# credit Priansh Shah
#
rm(list = ls())
#
library(xgboost)
#
data <- read.csv("parkinsons.csv")   # UCI Parkinson's dataset used in the article
#
# prepare the data for xgboost: split off the status labels and min-max
# scale each feature to [-1, 1]
#
status <- as.matrix(data$status)          # labels: 1 = Parkinson's, 0 = healthy
data$status <- NULL
data <- as.matrix(data[, 2:ncol(data)])   # drop the name column, keep numeric features
scales <- numeric()
scale_factors <- c(-1, 1)
for (i in 1:ncol(data)) {
scales[i] <- max(data[, i]) - min(data[, i])
data[, i] <-
(((scale_factors[2] - scale_factors[1]) * (data[, i] - min(data[, i]))) /
scales[i]) + scale_factors[1]
}
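#
# optional sanity check (not in the original script): after scaling, every
# feature should lie in [-1, 1]; this also flags any constant column, which
# would produce NaN from the division above
#
stopifnot(all(data >= -1), all(data <= 1))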
#
# use 14% of data for test data per original article
#
train_indices <- sample(seq_len(nrow(data)), floor(0.86 * nrow(data)), replace = FALSE)
train_data <- data[train_indices, ]
train_labels <- status[train_indices]
test_data <- data[-train_indices, ]
test_labels <- status[-train_indices]
#
# train the model and store it
#
model <- xgboost(data = train_data, label = train_labels,
                 objective = "binary:logistic",   # classification, as in the article's XGBClassifier
                 nrounds = 100)
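#
# optional (an assumption, not in the original script): persist the trained
# booster to disk so it can be reloaded later with xgb.load(); the filename
# is illustrative
#
xgb.save(model, "parkinsons_xgb.model")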
#
train_predictions <- predict(model, newdata = train_data)
train_predictions <- as.numeric(train_predictions >= 0.5)   # threshold at 0.5
train_accuracy <- 100 * mean(train_predictions == train_labels)
#
test_predictions <- predict(model, newdata = test_data)
test_predictions <- as.numeric(test_predictions >= 0.5)     # threshold at 0.5
test_accuracy <- 100 * mean(test_predictions == test_labels)
#
cat("accuracy on test data = ", test_accuracy,
"\naccuracy on train data = ", train_accuracy)