forked from matthewjdenny/GERGM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparallel_gergm.Rd
285 lines (241 loc) · 14.7 KB
/
parallel_gergm.Rd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/parallel_gergm.R
\name{parallel_gergm}
\alias{parallel_gergm}
\title{A Function to estimate a number of GERGMs in parallel, each with its own equation.}
\usage{
parallel_gergm(formula_list, observed_network_list,
covariate_data_list = NULL, network_data_list = NULL, cores = 1,
normalization_type = c("log", "division"), network_is_directed = TRUE,
use_MPLE_only = FALSE, transformation_type = c("Cauchy", "LogCauchy",
"Gaussian", "LogNormal"), estimation_method = c("Gibbs", "Metropolis"),
maximum_number_of_lambda_updates = 10,
maximum_number_of_theta_updates = 10,
number_of_networks_to_simulate = 500, thin = 1, proposal_variance = 0.1,
downweight_statistics_together = TRUE, MCMC_burnin = 100, seed = 123,
convergence_tolerance = 0.01, MPLE_gain_factor = 0,
acceptable_fit_p_value_threshold = 0.05, force_x_theta_updates = 1,
force_x_lambda_updates = 1, output_directory = NULL, output_name = NULL,
generate_plots = TRUE, verbose = TRUE,
hyperparameter_optimization = FALSE, stop_for_degeneracy = FALSE,
target_accept_rate = 0.25, theta_grid_optimization_list = NULL,
beta_correlation_model = FALSE, weighted_MPLE = FALSE,
fine_grained_pv_optimization = FALSE, parallel = FALSE,
parallel_statistic_calculation = FALSE, cores_per_model = 1,
use_stochastic_MH = FALSE, stochastic_MH_proportion = 0.25, ...)
}
\arguments{
\item{formula_list}{A list of formula objects that specifies the relationship
between statistics and the observed network for each gergm. See the gergm()
documentation for more details.}
\item{observed_network_list}{A list of observed networks (as numeric matrices
to be used with each specification).}
\item{covariate_data_list}{An optional list of covariate data frames (may
include NULL entries if no covariates are needed in some specifications)}
\item{network_data_list}{An optional list of lists of network covariates
to be included in each specification (one list per specification -- may also be left NULL for some specifications). The list object corresponding to each
specification must have entries for network covariates named as they appear
in the corresponding equation. For example if the user specified a
'netcov(distance)' term, the corresponding list object for that specification
would need a $distance entry containing the corresponding matrix object.}
\item{cores}{The number of cores to be used for parallelization.}
\item{normalization_type}{If only a raw_network is provided the function
will automatically check to determine if all edges fall in the [0,1] interval.
If edges are determined to fall outside of this interval, then a transformation
onto the interval may be specified. If "division" is selected, then the data
will have a value added to them such that the minimum value is at least zero
(if necessary) and then all edge values will be divided by the maximum to
ensure that the maximum value is in [0,1]. If "log" is selected, then the data
will have a value added to them such that the minimum value is at least zero
(if necessary), then 1 will be added to all edge values before they are logged
and then divided by the largest value, again ensuring that the resulting
network is on [0,1]. Defaults to "log" and need not be set to NULL if
providing covariates as it will be ignored.}
\item{network_is_directed}{Logical specifying whether or not the observed
network is directed. Default is TRUE.}
\item{use_MPLE_only}{Logical specifying whether or not only the maximum pseudo
likelihood estimates should be obtained. In this case, no simulations will be
performed. Default is FALSE.}
\item{transformation_type}{Specifies how covariates are transformed onto the
raw network. When working with heavy tailed data that are not strictly
positive, select "Cauchy" to transform the data using a Cauchy distribution.
If data are strictly positive and heavy tailed (such as financial data) it is
suggested the user select "LogCauchy" to perform a Log-Cauchy transformation
of the data. For a transformation of the data using a Gaussian distribution,
select "Gaussian" and for strictly positive raw networks, select "LogNormal".
The Default value is "Cauchy".}
\item{estimation_method}{Simulation method for MCMC estimation. Default is
"Gibbs" which will generally be faster with well behaved networks but will not
allow for exponential downweighting.}
\item{maximum_number_of_lambda_updates}{Maximum number of iterations of outer
MCMC loop which alternately estimates transform parameters and ERGM
parameters. In the case that data_transformation = NULL, this argument is
ignored. Default is 10.}
\item{maximum_number_of_theta_updates}{Maximum number of iterations within the
MCMC inner loop which estimates the ERGM parameters. Default is 10.}
\item{number_of_networks_to_simulate}{Number of simulations generated for
estimation via MCMC. Default is 500.}
\item{thin}{The proportion of samples that are kept from each simulation. For
example, thin = 1/200 will keep every 200th network in the overall simulated
sample. Default is 1.}
\item{proposal_variance}{The variance specified for the Metropolis Hastings
simulation method. This parameter is inversely proportional to the average
acceptance rate of the M-H sampler and should be adjusted so that the average
acceptance rate is approximately 0.25. Default is 0.1.}
\item{downweight_statistics_together}{Logical specifying whether or not the
weights should be applied inside or outside the sum. Default is TRUE and user
should not select FALSE under normal circumstances.}
\item{MCMC_burnin}{Number of samples from the MCMC simulation procedure that
will be discarded before drawing the samples used for estimation.
Default is 100.}
\item{seed}{Seed used for reproducibility. Default is 123.}
\item{convergence_tolerance}{Threshold designated for stopping criterion. If
the difference of parameter estimates from one iteration to the next all have
a p-value (under a paired t-test) greater than this value, the parameter
estimates are declared to have converged. Default is 0.01.}
\item{MPLE_gain_factor}{Multiplicative constant between 0 and 1 that controls
how far away the initial theta estimates will be from the standard MPLEs via
a one step Fisher update. In the case of strongly dependent data, it is
suggested to use a value of 0.10. Default is 0.}
\item{acceptable_fit_p_value_threshold}{A p-value threshold for how closely
statistics of observed network conform to statistics of networks simulated
from GERGM parameterized by converged final parameter estimates. Default value
is 0.05.}
\item{force_x_theta_updates}{Defaults to 1 where theta estimation is not
allowed to converge until thetas have updated for x iterations. Useful when
model is not degenerate but simulated statistics do not match observed network
well when algorithm stops after first y updates.}
\item{force_x_lambda_updates}{Defaults to 1 where lambda estimation is not
allowed to converge until lambdas have updated for x iterations. Useful when
model is not degenerate but simulated statistics do not match observed network
well when algorithm stops after first y updates.}
\item{output_directory}{The directory where you would like output generated
by the GERGM estimation procedure to be saved (if output_name is specified).
This includes GOF, trace, and parameter estimate plots, as well as a summary
of the estimation procedure and an .Rdata file containing the GERGM object
returned by this function. May be left as NULL if the user would prefer all
plots be printed to the graphics device.}
\item{output_name}{The common name stem you would like to assign to all
objects output by the gergm function. Default value of NULL will not save any
output directly to .pdf files, it will be printed to the console instead. Must
be a character string or NULL. For example, if "Test" is supplied as the
output_name, then 5 files will be output: "Test_GOF.pdf",
"Test_Parameter_Estimates.pdf", "Test_GERGM_Object.Rdata",
"Test_Estimation_Log.txt", and "Test_Trace_Plot.pdf". Must be the same length
as the number of specifications or specification_i will be automatically used
to distinguish between
specifications.}
\item{generate_plots}{Defaults to TRUE, if FALSE, then no diagnostic or
parameter plots are generated.}
\item{verbose}{Defaults to TRUE (providing lots of output while model is
running). Can be set to FALSE if the user wishes to see less output.}
\item{hyperparameter_optimization}{Logical indicating whether automatic
hyperparameter optimization should be used. Defaults to FALSE. If TRUE, then
the algorithm will automatically seek to find an optimal burnin and number of
networks to simulate, and if using Metropolis Hastings, will attempt to select
a proposal variance that leads to an acceptance rate within +-0.05 of
target_accept_rate. Furthermore, if degeneracy is detected, the algorithm
will attempt to address the issue automatically. WARNING: This feature is
experimental, and may greatly increase runtime. Please monitor console
output!}
\item{stop_for_degeneracy}{When TRUE, automatically stops estimation when
degeneracy is detected, even when hyperparameter_optimization is set to TRUE.
Defaults to FALSE. SPECIFY SINGLE VALUE, MUST BE CONSTANT ACROSS SPECIFICATIONS.}
\item{target_accept_rate}{The target Metropolis Hastings acceptance rate.
Defaults to 0.25}
\item{theta_grid_optimization_list}{Defaults to NULL. This highly
experimental feature may allow the user to address model degeneracy arising
from a suboptimal theta initialization. It performs a grid search around the
theta values calculated via MPLE to select a potentially improved
initialization. The runtime complexity of this feature grows exponentially in
the size of the grid and number of parameters -- use with great care. This
feature may only be used if hyperparameter_optimization = TRUE, and if a list
object of the following form is provided: list(grid_steps = 2,
step_size = 0.5, cores = 2, iteration_fraction = 0.5). grid_steps indicates
the number of steps out the grid search will perform, step_size indicates the
fraction of the MPLE theta estimate that each grid search step will change by,
cores indicates the number of cores to be used for parallel optimization, and
iteration_fraction indicates the fraction of the number of MCMC iterations
that will be used for each grid point (should be set less than 1 to speed up
optimization). In general grid_steps should be smaller the more structural
parameters the user wishes to specify. For example, with 5 structural
parameters (mutual, ttriads, etc.), grid_steps = 3 will result in a (2*3+1)^5
= 16807 parameter grid search. Again this feature is highly experimental and
should only be used as a last resort (after playing with exponential
downweighting and the MPLE_gain_factor). SPECIFY SINGLE VALUE, MUST BE
CONSTANT ACROSS SPECIFICATIONS.}
\item{beta_correlation_model}{Defaults to FALSE. If TRUE, then the beta
correlation model is estimated. A correlation network must be provided, but
all covariates and undirected statistics may be supplied as normal. SPECIFY
SINGLE VALUE, MUST BE CONSTANT ACROSS SPECIFICATIONS.}
\item{weighted_MPLE}{Defaults to FALSE. Should be used whenever the user is
specifying statistics with alpha downweighting. Tends to provide better
initialization when downweight_statistics_together = FALSE. SPECIFY SINGLE
VALUE, MUST BE CONSTANT ACROSS SPECIFICATIONS.}
\item{fine_grained_pv_optimization}{Logical indicating whether fine grained
proposal variance optimization should be used. This will often slow down
proposal variance optimization, but may provide better results. Highly
recommended if running a correlation model. SPECIFY SINGLE VALUE, MUST BE
CONSTANT ACROSS SPECIFICATIONS.}
\item{parallel}{Logical indicating whether the weighted MPLE objective and any
other operations that can be easily parallelized should be calculated in
parallel. Defaults to FALSE. If TRUE, a significant speedup in computation
may be possible. SPECIFY SINGLE VALUE, MUST BE CONSTANT ACROSS
SPECIFICATIONS.}
\item{parallel_statistic_calculation}{Logical indicating whether network
statistics should be calculated in parallel. This will tend to be slower for
networks with fewer than ~30 nodes but may provide a substantial speedup for
larger networks. SPECIFY SINGLE VALUE, MUST BE CONSTANT ACROSS SPECIFICATIONS.}
\item{cores_per_model}{Numeric value defaulting to 1. Can be set to any
number up to the number of threads/cores available on your machine. Will be
used to speed up computations if parallel = TRUE. Note that this will be the
number of cores requested by EACH model, so plan accordingly! SPECIFY SINGLE
VALUE, MUST BE CONSTANT ACROSS SPECIFICATIONS.}
\item{use_stochastic_MH}{A logical indicating whether a stochastic approximation
to the h statistics should be used under Metropolis Hastings in-between
thinned samples. This may dramatically speed up estimation. Defaults to FALSE.
HIGHLY EXPERIMENTAL! SPECIFY SINGLE VALUE, MUST BE CONSTANT ACROSS
SPECIFICATIONS.}
\item{stochastic_MH_proportion}{Proportion of dyads/triads to use for
approximation, defaults to 0.25. SPECIFY SINGLE VALUE, MUST BE CONSTANT
ACROSS SPECIFICATIONS.}
\item{...}{Optional arguments, currently unsupported.}
}
\value{
A list of gergm objects for each model specified.
}
\description{
Allows the user to run multiple specifications at once in
parallel. All variables (excluding formula_list, observed_network_list,
covariate_data_list, network_data_list, cores and generate_plots) may be
either specified as a single value or as a vector of values equal to the
length of formula_list, if the user wishes to use different values for each
specification.
}
\examples{
\dontrun{
# Fix the RNG seed so the simulated example data are reproducible.
set.seed(12345)
# Simulate a 10 x 10 network with edge values drawn uniformly from [0, 1].
net <- matrix(runif(100,0,1),10,10)
# Label rows and columns with the first ten lowercase letters.
colnames(net) <- rownames(net) <- letters[1:10]
# Node-level covariates, one row per node, with row names matching the network.
node_level_covariates <- data.frame(Age = c(25,30,34,27,36,39,27,28,35,40),
Height = c(70,70,67,58,65,67,64,74,76,80),
Type = c("A","B","B","A","A","A","B","B","C","C"))
rownames(node_level_covariates) <- letters[1:10]
# A network-level covariate: the network plus Gaussian noise (mean 0, sd 0.5).
network_covariate <- net + matrix(rnorm(100,0,.5),10,10)
# Network covariates are passed as a named list; the entry name matches the
# name used in the netcov() terms of the formulas below.
network_data_list <- list(network_covariate = network_covariate)
# First specification: edges, a sender effect, a network covariate and nodematch.
formula <- net ~ edges + sender("Age") +
netcov("network_covariate") + nodematch("Type",base = "A")
# Second specification: edges, the same network covariate and nodemix.
formula2 <- net ~ edges +
netcov("network_covariate") + nodemix("Type",base = "A")
# Collect both specifications in a named list.
form_list <- list(f1 = formula,
f2 = formula2)
# Estimate the specifications with cores = 2. With thin = 1/100, every 100th of
# the 10000 simulated networks is kept.
testl <- parallel_gergm(formula_list = form_list,
observed_network_list = net,
covariate_data_list = node_level_covariates,
network_data_list = network_data_list,
cores = 2,
number_of_networks_to_simulate = 10000,
thin = 1/100,
proposal_variance = 0.1,
MCMC_burnin = 5000)
}
}