# MachineLearning.R
# New performance sampling; all experiments in one file
library(caret)
library(randomForest)
library(pROC)
library(doMC)
library(rjson)
nExperiments = 100
nCores = 20
registerDoMC(cores = nCores)
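# doMC registers a parallel backend so caret can train resampling folds across
# cores (used when allowParallel = TRUE in the train() calls below).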
# TODO: Uncomment the column ranges below for the feature set you are using;
# the provenance- and predicate-feature experiments depend on them.
## Guney reference set
# PredColumns = c(2:142)
# ProvColumns = c(143:213)
## EMC reference set
PredColumns = c(2:149)
ProvColumns = c(150:227)
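# PredColumns / ProvColumns hold the column index ranges of the predicate and
# provenance features in the feature table (assumed from how they are used in
# the feature-removal experiments below).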
# General code
## Read in the data
input = read.csv2("Feature set.csv", stringsAsFactors = F)
reference_set = read.csv2("Reference set.csv", stringsAsFactors = F)
colnames(reference_set) = c("Drug", "DrugTargets")
## Calculate the number of drug targets per drug, and add this information as a feature
reference_set$nDT = sapply(reference_set$DrugTargets, function(x){length(unique(fromJSON(x)))})
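# DrugTargets is assumed to hold a JSON array of target identifiers; e.g. a
# (hypothetical) value '["P35354", "P23219"]' would yield nDT = 2.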
input = merge(input, unique(reference_set[,c("Drug", "nDT")]), by = "Drug")
## Separate the dataset into the features and the non-features
removalColumns = c("Drug", "Disease", "diseaseProtCount", "nDT")
dd_data = input[,which(colnames(input) %in% removalColumns)]
input = input[,-which(colnames(input) %in% removalColumns)]
## Function for balanced sampling of the instances
sample_count = length(which(input$goldstandard == "VALID"))
createBalancedSet = function(df, n, pos_class = "VALID"){
  balanced = rbind(df[sample(row.names(df[df$goldstandard == pos_class, ]), n), ],
                   df[sample(row.names(df[df$goldstandard != pos_class, ]), n), ])
  return(balanced)
}
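# Example: if the data contain 50 VALID rows, createBalancedSet(input, 50)
# returns a 100-row data frame with equal VALID and non-VALID counts.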
# Create the data frame that stores the output: one row per experiment,
# one column (cross-validated AUC) per experimental condition
out = data.frame(auc_complete      = numeric(nExperiments),
                 auc_oneTarget     = numeric(nExperiments),
                 auc_twoTarget     = numeric(nExperiments),
                 auc_threeTarget   = numeric(nExperiments),
                 auc_indirect_only = numeric(nExperiments),
                 auc_preds_only    = numeric(nExperiments),
                 auc_prov_only     = numeric(nExperiments),
                 auc_noFeatures    = numeric(nExperiments))
## Calculate the performances
for(i in 1:nrow(out)){
print(i)
registerDoMC(cores = nCores)
input_sample = createBalancedSet(input, sample_count)
dd_sample = input_sample[,which(colnames(input_sample) %in% removalColumns)]
dd_sample$rowIndex = 1:nrow(input_sample)
input_sample = input_sample[,-which(colnames(input_sample) %in% removalColumns)]
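# dd_sample keeps the metadata (including nDT) plus a rowIndex, so caret's
# per-fold predictions can later be joined back to the drug-target counts.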
# Complete dataset
model = train(goldstandard ~., data = input_sample, method = "rf",
trControl=trainControl(method="cv",number=10, classProbs = TRUE,
savePredictions = TRUE, summaryFunction = twoClassSummary),
prox=TRUE, allowParallel=TRUE, metric = "ROC")
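# Store the cross-validated ROC AUC at the mtry value selected by caret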
out$auc_complete[i] = model$results$ROC[model$results$mtry == model$bestTune$mtry]
# Evaluate performance separately for drugs with 1, 2, and 3 or more drug targets
cv_results = model$pred[model$pred$mtry == model$bestTune$mtry, ]
cv_results = merge(cv_results[,c("obs", "VALID", "rowIndex")], dd_sample[,c("nDT", "rowIndex")], by = "rowIndex")
out$auc_oneTarget[i] = as.numeric(roc(cv_results$obs[cv_results$nDT == 1], cv_results$VALID[cv_results$nDT == 1])$auc)
out$auc_twoTarget[i] = as.numeric(roc(cv_results$obs[cv_results$nDT == 2], cv_results$VALID[cv_results$nDT == 2])$auc)
out$auc_threeTarget[i] = as.numeric(roc(cv_results$obs[cv_results$nDT > 2], cv_results$VALID[cv_results$nDT > 2])$auc)
# Remove the direct relationships and measure performance
input_nobias = input_sample[,-grep("direct_", colnames(input_sample))]
input_nobias = input_nobias[,-grep("selfref_", colnames(input_nobias))]
input_nobias$overlap = NULL
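# After dropping direct_*, selfref_* and overlap, only the indirect-path
# (step1_/step2_) features remain.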
model = train(goldstandard ~., data = input_nobias, method = "rf",
trControl=trainControl(method="cv",number=10, classProbs = TRUE,
savePredictions = TRUE, summaryFunction = twoClassSummary),
prox=TRUE, allowParallel=TRUE, metric = "ROC")
out$auc_indirect_only[i] = model$results$ROC[model$results$mtry == model$bestTune$mtry]
# Remove the provenance features and train (predicate features only)
input_noProv = input_sample[,-ProvColumns]
model = train(goldstandard ~., data = input_noProv, method = "rf",
trControl=trainControl(method="cv",number=10, classProbs = TRUE,
savePredictions = TRUE, summaryFunction = twoClassSummary),
prox=TRUE, allowParallel=TRUE, metric = "ROC")
out$auc_preds_only[i] = model$results$ROC[model$results$mtry == model$bestTune$mtry]
# Remove the predicate features and train (provenance features only)
input_noPred = input_sample[,-PredColumns]
model = train(goldstandard ~., data = input_noPred, method = "rf",
trControl=trainControl(method="cv",number=10, classProbs = TRUE,
savePredictions = TRUE, summaryFunction = twoClassSummary),
prox=TRUE, allowParallel=TRUE, metric = "ROC")
out$auc_prov_only[i] = model$results$ROC[model$results$mtry == model$bestTune$mtry]
# Create only "Associated with" relationships and train
direct_cols = grep("direct_", colnames(input_sample))
indirect_cols = grep("step1_|step2_", colnames(input_sample))
input_sample$direct_associated_with = rowSums(input_sample[,direct_cols]) > 0
input_sample$indirect_associated_with = rowSums(input_sample[,indirect_cols]) > 0
input_sample = input_sample[,c("overlap", "direct_associated_with", "indirect_associated_with", "goldstandard")]
model = train(goldstandard ~., data = input_sample, method = "rf",
trControl=trainControl(method="cv",number=10, classProbs = TRUE,
savePredictions = TRUE, summaryFunction = twoClassSummary),
prox=TRUE, allowParallel=TRUE, metric = "ROC")
out$auc_noFeatures[i] = model$results$ROC[model$results$mtry == model$bestTune$mtry]
}
write.csv2(out, "Results crossvalidation experiments.csv", row.names = F)
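# Downstream, the per-experiment AUC distributions can be summarised, e.g.:
# sapply(out, function(x) c(mean = mean(x), sd = sd(x)))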