-
Notifications
You must be signed in to change notification settings - Fork 0
/
var2.R
127 lines (87 loc) · 3.28 KB
/
var2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# var2.R
# Name: Stephen Ling
# Email: [email protected]
# README: this Rscript tests 2 vars
# Input: alg.txt (name of the algorithm to run), data2.csv (data set; presumably derived from train.csv / test.csv)
# Output: 2.txt
# ip + app + device + os + channel
# preparation
# No rm(list = ls()) here: under Rscript the workspace starts empty anyway,
# and when this file is source()d it would wipe the caller's objects.
library(mlr3verse)
library(imbalance)

# Read the algorithm name from alg.txt (not from the command line).
# Only the first line is used and surrounding whitespace is stripped, so a
# trailing blank line or stray space cannot break the string comparisons
# performed later on this value.
algorithm <- trimws(readLines("alg.txt", n = 1L, warn = FALSE))
if (length(algorithm) != 1L || !nzchar(algorithm)) {
  stop("alg.txt must contain an algorithm name on its first line",
       call. = FALSE)
}

# Load the data and make the target a factor, as required for
# classification and for oversample()'s class attribute.
data.temp <- read.csv("data2.csv")
data.temp$is_attributed <- as.factor(data.temp$is_attributed)

# Rebalance the classes with SMOTE from the imbalance package.
# NOTE(review): ratio = 0.8 is understood to be the requested
# minority/majority size ratio -- confirm against the imbalance docs.
data.temp <- oversample(
  data.temp,
  ratio = 0.8,
  method = "SMOTE",
  filtering = FALSE,
  classAttr = "is_attributed"
)
# Train one classifier on a small random subset of `t` and score it on the
# remaining rows.
#
# Args:
#   alg:        single string naming the algorithm; one of "logistic",
#               "naiveBayes", "xgboost" or "randomForest".
#   t:          data frame; columns 1, 2 and 6 are used and must include the
#               factor target column `is_attributed`.
#   train_frac: fraction of rows sampled for training (default 0.001); all
#               other rows are used for testing.
#   threads:    thread count passed to set_threads() (default 7).
#   seed:       RNG seed set before sampling (default 1), so every algorithm
#               sees the same train/test split.
#
# Returns:
#   cm[2, 2] / (cm[2, 1] + cm[2, 2]) where cm is the prediction's confusion
#   matrix, as a single number (NaN when the second row is all zero).
#
# Raises an error for an unknown `alg` or when the training sample would be
# empty.
var.comb <- function(alg, t, train_frac = 0.001, threads = 7, seed = 1) {
  stopifnot(is.character(alg), length(alg) == 1L, !is.na(alg))

  # The algorithms differ only in task id and learner key, so look those up
  # once instead of repeating the whole pipeline per algorithm.
  spec <- switch(
    alg,
    logistic     = list(id = "stat479_logi", key = "classif.log_reg"),
    naiveBayes   = list(id = "stat479_nb",   key = "classif.naive_bayes"),
    xgboost      = list(id = "stat479_xg",   key = "classif.xgboost"),
    randomForest = list(id = "stat479_rf",   key = "classif.ranger"),
    stop(
      "unknown algorithm '", alg, "'; expected one of ",
      "logistic, naiveBayes, xgboost, randomForest",
      call. = FALSE
    )
  )

  # prepare data: keep only the columns under test plus the target
  data <- t[, c(1, 2, 6)]
  task <- TaskClassif$new(spec$id, data, target = "is_attributed")

  learner <- lrn(spec$key)
  set_threads(learner, n = threads)

  # set seed to make sure each algorithm gets the same train & test split
  set.seed(seed)
  n_train <- floor(train_frac * task$nrow)
  if (n_train < 1) {
    stop(
      "training sample is empty: ", task$nrow, " rows * train_frac ",
      train_frac, " < 1",
      call. = FALSE
    )
  }
  train_set <- sample(task$row_ids, n_train)
  test_set <- setdiff(task$row_ids, train_set)

  learner$train(task, row_ids = train_set)
  pred <- learner$predict(task, row_ids = test_set)

  # mlr3 documents the confusion matrix with predicted response in rows and
  # truth in columns, so this is the share of test rows predicted as the
  # second class level that truly belong to it.
  # NOTE(review): the caller stores this as `accuracy`, but it is not overall
  # accuracy -- confirm this is the intended metric.
  cm <- pred$confusion
  cm[2, 2] / (cm[2, 1] + cm[2, 2])
}
# Score the selected algorithm on the oversampled data and write the value
# to 2.txt as a single line of text.
accuracy <- var.comb(algorithm, data.temp)

# Guard against a missing score (e.g. var.comb() returning NULL) so that an
# empty 2.txt is never written silently.
if (length(accuracy) != 1L) {
  stop("var.comb() did not return a single score; refusing to write 2.txt",
       call. = FALSE)
}

result <- toString(accuracy)
# Passing the path lets writeLines() open and close the file itself, so no
# connection is left open if the write fails.
writeLines(result, "2.txt")