-
Notifications
You must be signed in to change notification settings - Fork 0
/
iBGe.R
307 lines (219 loc) · 7.67 KB
/
iBGe.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
# when we don't know the interventions and they just lead to a mean shift
local_run <- FALSE#TRUE
standardize <- TRUE # whether to standardise the data
# if we just want the data
just_data <- FALSE#TRUE
# if we want UT-IGSP evaluated
# (if needs to be run before in python, after making the data)
eval_SP <- TRUE
# if we only want a mean shift
only_shift <- FALSE
# to run locally we need to set the seed and interventions
# this is done externally on the cluster runs
if (local_run) {
kk <- 1
seed_number <- 101 # the seed
}
# load libraries
library(pcalg)
library(graph)
library(BiDAG)
# Use BiDAG with intervention scoring
insertSource("./usrscorefns.R", package = "BiDAG")
source("./Rfns/intfns.R") # load other functions
### Settings
# for 100 nodes takes up to an hour for MAP and MCMC
# with all settings for ROC curve
#scaleN <- 10 # normally 1
scaleN <- 1
n <- 100 # number of nodes
N <- 4*n*scaleN # number of observations
exp_ints <- scaleN*n*c(5, 10, 20)[kk]/100 # number of observations per intervention
exp_parents <- 2 # expected number of parents
ni <- 10 # number of possible interventions
# store values for later dataframe
setup_vec <- c(n, N, exp_ints, exp_parents, ni, seed_number)
names(setup_vec) <- c("n", "N", "ints", "parents", "ni", "seed")
# create a name for the directory to store stuff
f_name <- paste(paste(names(setup_vec), setup_vec, sep = "_"), collapse = "_")
subdir_name <- paste(paste(names(setup_vec[-6]), setup_vec[-6], sep = "_"), collapse = "_")
if (only_shift) {
if (standardize) {
top_dir_name <- "./simresults_s"
} else {
top_dir_name <- "./simresults_s_unscaled"
}
} else {
if (standardize) {
top_dir_name <- "./simresults_g"
} else {
top_dir_name <- "./simresults_g_unscaled"
}
}
dir_name <- paste(top_dir_name, subdir_name, sep = "/")
if (!dir.exists(top_dir_name)) { # create directory if none exists
dir.create(top_dir_name)
}
if (!dir.exists(dir_name)) { # create directory if none exists
dir.create(dir_name)
}
### Generate data
set.seed(seed_number) # set seed
# generate which intervention each node gets
#Ts <- c(rep(0, N*(1-exp_ints)),
# sort(sample.int(ni, N*exp_ints, replace = TRUE)))
Ts <- c(rep(0, N - exp_ints*ni),
rep(1:ni, each = exp_ints))
# can remove non-existent interventions,
# but we remove later anyway
#Ts <- rep(1:length(unique(Ts))-1, table(Ts))
# which nodes are targeted
Tlist <- vector("list", ni)
# how much mean shift they create
Tshifts <- rnorm(ni, 1, 0.1)
# how much they damp the contribution from parents
Tdamps <- runif(ni, min = 0.1) #rnorm(ni, 1, 0.1)
for (ii in 1:ni){
Tlist[[ii]] <- sample.int(n, 1+rpois(1, 1))
}
# generate intervention matrix
Tmat <- matrix(0, ncol = n, nrow = N)
Imat <- matrix(0, ncol = ni, nrow = N)
shift_vec <- rep(0, N)
damp_vec <- rep(0, N)
for (ii in 1:N){
if (Ts[ii] > 0) {
Tmat[ii, Tlist[[Ts[ii]]]] <- 1
Imat[ii, Ts[ii]] <- 1
shift_vec[ii] <- Tshifts[Ts[ii]]
damp_vec[ii] <- Tdamps[Ts[ii]]
}
}
# remove interventions that do not appear
# this should not happen in the current setting!
Imat <- Imat[, which(colSums(Imat) > 0)]
# turn intervention matrix into format for pcalg
# should be same as used elements of Tlist
# with a padded entry at the start for observational data
targetsmat <- mgcv::uniquecombs(Tmat)
if (is.null(nrow(targetsmat))) {
targets <- vector("list", 1)
targets[[1]] <- integer(0)
} else {
targets <- vector("list", nrow(targetsmat))
for (ii in 1:nrow(targetsmat)) {
targets[[ii]] <- which(targetsmat[ii,] == 1)
}
}
set.seed(seed_number) # set seed
# generate random DAG
trueDAGedges <- as(pcalg::randDAG(n = n, d = 2*exp_parents, wFUN = list(wFUN, 0.25, 1)), "matrix")
trueDAG <- 1*(trueDAGedges != 0)
trueEG <- DAG2EG(trueDAG, targets)
# generate simulated data
if (only_shift) {
data <- irmvDAG(trueDAGedges, Tmat, shift = shift_vec, scale = standardize)
} else {
data <- irmvDAG(trueDAGedges, Tmat, shift = shift_vec, damp = damp_vec, scale = standardize)
}
method_vec <- rep(NA, 3)
names(method_vec) <- c("method", "parameter", "value")
if(just_data) { # write files
if (!file.exists(paste0(dir_name, "/", f_name, "_data.csv"))) {
write.csv(data, paste0(dir_name, "/", f_name, "_data.csv"),
row.names = FALSE)
write.csv(Ts, paste0(dir_name, "/", f_name, "_Ts.csv"),
row.names = FALSE)
}
}
if (!just_data) {
### MAP and MCMC search
# fill out missing simulations
if (!file.exists(paste0(dir_name, "/", f_name, "_BiDAG.Rdata"))) {
result_df <- NULL
MAPtime <- 0
MCMCtime <- 0
### MAP search
MAP_ams <- signif(exp(c(-4:0*3/2, 1:4/2))/10, 3)
am_samp <- MAP_ams[5]
method_vec[2] <- "am"
# Start the clock!
ptm <- proc.time()
for (am_value in MAP_ams) {
method_vec[1] <- "MAP"
method_vec[3] <- am_value
scoreObject <- scoreparameters(scoretype = "usr", data = data, usrpar = list(pctesttype = "bge", Imat = Imat, am = am_value))
set.seed(seed_number) # set seed
bestDAGs <- iterativeMCMC(scoreObject, scoreout = TRUE, verbose = TRUE)
bestDAG <- bestDAGs$DAG
#(bestDAGscore <- bestDAGs$score)
#DAGscore(scoreObject, bestDAG)
if (ncol(Imat) > 0) {
bestDAG <- bestDAG[-c(1:ncol(Imat)), -c(1:ncol(Imat))]
}
## Turn DAG into essential graph
bestEG <- DAG2EG(bestDAG, targets)
MAPresult <- compareEGs(bestEG, trueEG)
result_df <- rbind(result_df, data.frame(t(c(setup_vec, method_vec, MAPresult))))
# Stop the clock
MAPtime <- MAPtime + (proc.time() - ptm)[1]
## Order sample
# Start the clock!
ptm <- proc.time()
set.seed(seed_number) # set seed
orderresult <- orderMCMC(scoreObject, startspace = bestDAGs$endspace,
MAP = FALSE, plus1 = TRUE, chainout = TRUE,
startorder = bestDAGs$maxorder,
iterations = 20*round(n*n*log(n)),
verbose = TRUE)
# Stop the clock
MCMCtime <- MCMCtime + (proc.time() - ptm)[1]
orderchain <- orderresult$traceadd$incidence
if (ncol(Imat) > 0) {
for (ii in 1:length(orderchain)) {
orderchain[[ii]] <- orderchain[[ii]][-c(1:ncol(Imat)), -c(1:ncol(Imat))]
}
}
posteriors <- EGedgePosterior(orderchain, targets = targets)
# Consensus graphs
method_vec[1] <- "MCMC"
MCMCEG <- consensusEG(posteriors, 0.5)
MCMCresult <- compareEGs(MCMCEG, trueEG)
result_df <- rbind(result_df, data.frame(t(c(setup_vec, method_vec, MCMCresult))))
}
time_df <- data.frame(n = n, N = N, ints = exp_ints,
parents = exp_parents, ni = ni, seed = seed_number,
method = c("MAP", "MCMC"),
time = c(MAPtime, MCMCtime))
save(result_df, time_df, file = paste0(dir_name, "/", f_name, "_BiDAG.Rdata"))
}
if (eval_SP) {
### SP
# fill out missing simulations
if (!file.exists(paste0(dir_name, "/", f_name, "_UT-IGSP.Rdata"))) {
result_df <- NULL
SP_DAGs <- NULL
if (file.exists(paste0(dir_name, "/", f_name, "_UT-IGSP_DAGs.csv"))) {
SP_DAGs <- read.csv(paste0(dir_name, "/", f_name, "_UT-IGSP_DAGs.csv"), header = FALSE)
}
method_vec[1:2] <- c("UT-IGSP", "alpha")
SP_alphas <- signif(exp(c(-4:0*3/2, 1:4/2))/1e5, 3)
if (!is.null(SP_DAGs)) {
for (ii in 1:length(SP_alphas)) {
alpha <- SP_alphas[ii]
method_vec[3] <- alpha
SP_fit <- SP_to_DAG(as.character(SP_DAGs[ii, ]))
if (is.null(SP_fit)) {
SPresult <- rep(NA, 5)
names(SPresult) <- c("TP", "FP", "SHD", "TPR", "FPR_P")
} else {
SPEG <- 1*as(dag2essgraph(as(SP_fit, "graphNEL"), targets), "matrix")
SPresult <- compareEGs(SPEG, trueEG)
}
result_df <- rbind(result_df, data.frame(t(c(setup_vec, method_vec, SPresult))))
}
save(result_df, file = paste0(dir_name, "/", f_name, "_UT-IGSP.Rdata"))
}
}
}
}