-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_oneClassBinaryModel.R
131 lines (109 loc) · 5.72 KB
/
get_oneClassBinaryModel.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env Rscript
#get_oneClassBinaryModel.R
source(paste(getwd(),"/collecting/collecting_layer.R",sep=""))
source(paste(getwd(),"/preprocessing/preprocessing_layer.R",sep=""))
source(paste(getwd(),"/modeling/modeling_layer.R",sep=""))
#Description:
#Retrieve triples through SPARQL queries in order to train a binary model that tells whether
#a resource belongs to a DBpedia class or not. Return the trained model with metrics.
#Input:
# - positiveClass (String)
# - numberPositiveCases (integer)
# - negativeClasses (List <String>)
# - numberNegativeCases (List <integer>)
# - numberOfRequest (integer)
# - URL's endpoint SPARQL (String)
# - queryLimit (integer); it would be nice to have something like configuration settings to supply these values
# - domain_propertiesURI (string), optional, string filter used to grep specific properties, for instance, only those belonging to the DBpedia ontology
# - collected_data (List <List <data.frame>>), optional, pass it when the data has already been collected, to skip the collection phase
# - preprocessed_data (data frame), optional, pass it when the data has already been prepared, to skip the corresponding phase
#Exceptions
# - numberOfRequest must be an integer
# - numberOfRequest must be at least 1
#(if numberOfRequest<1 then an error is thrown)
#Return
# - Reference to each phase outputs (list with lists with several and heterogeneous objects)
# - Print input parameters and output metrics (accuracy, precision and recall, maybe f-measure too)
###################
#@@@ collecting; from SPARQL
#Run some SPARQL queries in order to obtain triples with the properties and resources of
#a positive class, passed as a String argument
#Run further SPARQL queries to obtain triples with the properties and resources of the
#negative classes, passed as a list argument
# - Input: positive class, number of positive cases, negative classes list, number of negative cases, url endpoint
# - Return 4 data frames in a list with 3 columns each one, column names always "s","p","o"
# > Positive resources with types (resource, a, positiveClass)
# > Positive resources with properties (resource, p, nonImportantData)
# > Negative resources with types (resource, a, negativeClasses)
# > Negative resources with properties (resource, p, nonImportantData)
##################
#@@@ preprocessing;
#Transform triples properly to obtain a learning matrix
# - Input: 4 data frames with types and properties about positive class and negative classes
# - properties are binary columns 1/0 (yes/no)
# - class column are 1 for positive class and 0 for the rest (negative classes)
# - Return data frame object, learningSet, save in hard disk?
#############
#@@@ modeling;
#Pass data to a Random Forest model, from H2O library
# - Input: 1 data frame, learningSet
# - Cross-Validation, 10 folds
# - Return model object with metrics (accuracy, precision, recall), save in hard disk?
# Run the collect -> preprocess -> model pipeline numberOfRequest times and
# return the outputs of every iteration.
#
# Arguments:
#  - positiveClass, numberPositiveCases, negativeClasses, numberNegativeCases,
#    urlEndpoint, queryLimit: forwarded unchanged to collecting_layer().
#  - numberOfRequest: a single finite whole number >= 1; number of iterations.
#  - nameModel, pathModel: forwarded to modeling_layer() together with the
#    iteration index.
#  - domain_propertiesURI: optional; forwarded to collecting_layer() and
#    preprocessing_layer().
#  - collected_data, preprocessed_data: optional saved outputs. When at least
#    one of them is supplied, the collection and preprocessing phases are
#    skipped and both values are used as-is in every iteration.
#  - randomSeed: forwarded as the last argument of modeling_layer(); defaults
#    to 1234, the value that used to be hard-coded (presumably the RNG seed,
#    confirm against modeling_layer()).
# Returns:
#  - A list of length numberOfRequest; element i is a list holding
#    data_collected, learningSet and trainedModel for iteration i.
# Errors:
#  - stop() when numberOfRequest is not a single whole number, or is < 1.
get_oneClassBinaryModel <- function(positiveClass, numberPositiveCases,
                                    negativeClasses, numberNegativeCases,
                                    numberOfRequest, urlEndpoint, queryLimit,
                                    nameModel, pathModel,
                                    domain_propertiesURI = NULL,
                                    collected_data = NULL,
                                    preprocessed_data = NULL,
                                    randomSeed = 1234) {
  # Reject anything that is not one finite whole number. The type and length
  # checks come first so that %% is only evaluated on a valid numeric scalar.
  if (!is.numeric(numberOfRequest) || length(numberOfRequest) != 1L ||
      !is.finite(numberOfRequest) || numberOfRequest %% 1 != 0) {
    stop(paste0("Error, numberOfRequest should be an integer: ",
                toString(numberOfRequest)), call. = FALSE)
  }
  if (numberOfRequest < 1) {
    stop(paste0("Error, numberOfRequest should be greater than 0: ",
                numberOfRequest), call. = FALSE)
  }

  # Saved data is used as soon as either optional argument is supplied.
  # NOTE(review): if only collected_data is given, learningSet is NULL and is
  # passed to modeling_layer() as such; confirm whether preprocessing should
  # run on collected_data in that case.
  use_saved_data <- !is.null(collected_data) || !is.null(preprocessed_data)

  stackRequest <- vector("list", numberOfRequest)
  for (i in seq_len(numberOfRequest)) {
    # TODO: replace the collection/preprocessing branch below with
    # preprocessing_layer_prediction() once that function has been tested.
    if (!use_saved_data) {
      print(paste0(Sys.time(), " -> starting request number ", i))
      print(paste0(Sys.time(), " -> starting collection phase"))
      # phase 1
      data_collected <- collecting_layer(positiveClass, numberPositiveCases,
                                         negativeClasses, numberNegativeCases,
                                         urlEndpoint, queryLimit,
                                         domain_propertiesURI)
      positive_types <- data_collected[[1]]
      positive_properties <- data_collected[[2]]
      negative_types <- data_collected[[3]]
      negative_properties <- data_collected[[4]]
      print(paste0(Sys.time(), " -> starting preprocessing phase"))
      # phase 2
      learningSet <- preprocessing_layer(positive_types, positive_properties,
                                         negative_types, negative_properties,
                                         domain_propertiesURI)
    } else {
      print("recovering saved data.....")
      data_collected <- collected_data
      learningSet <- preprocessed_data
      # Columns, not rows, hold the properties. The "- 2" is kept from the
      # original code (presumably the resource and Class columns, confirm).
      print(paste0("learning set with ", nrow(learningSet),
                   " resources as rows and ",
                   ncol(learningSet) - 2, " properties as columns"))
      # Count the rows labelled as positive class.
      print(paste0(sum(learningSet$Class == "1", na.rm = TRUE),
                   " resources with positive class (type)"))
    }
    print(paste0(Sys.time(), " -> starting modeling phase"))
    # phase 3
    trainedModel <- modeling_layer(learningSet, nameModel, i, pathModel,
                                   randomSeed)
    # Save iteration data and results. `$<-` assignment is kept on purpose:
    # a NULL value leaves the element out, exactly as before.
    aux_listStepCover <- list()
    aux_listStepCover$data_collected <- data_collected
    aux_listStepCover$learningSet <- learningSet
    aux_listStepCover$trainedModel <- trainedModel
    stackRequest[[i]] <- aux_listStepCover
    print(paste0(Sys.time(), " -> ending request number ", i))
  }
  return(stackRequest)
}