Skip to content

Commit

Permalink
stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
brunoguindani committed Jul 11, 2022
1 parent 712cc86 commit 9617049
Show file tree
Hide file tree
Showing 2 changed files with 143 additions and 144 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Release)

set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -funroll-loops -ftree-vectorize -Wno-deprecated")
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -funroll-loops -fopenmp -ftree-vectorize -Wno-deprecated")
set(CMAKE_FIND_PACKAGE_PREFER_CONFIG TRUE)

# Require PkgConfig
Expand Down
285 changes: 142 additions & 143 deletions python/reproducibility/replicate_jss_paper.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
OUTPUT_PATH = "reproducibility"

ALGORITHMS = "Neal2 Neal3 Neal8 SplitMerge".split()
ALGORITHMS = ["Neal3"]
# ALGORITHMS = ["Neal3"]

ALGO_SETTINGS = """
algo_id: "{}"
Expand Down Expand Up @@ -159,13 +159,13 @@ def run_bayesmix(log_fold, dataset, name, g0_params, univariate: bool):
out_clus = defaultdict(dict)
g0_name = "NNIG" if univariate else "NNW"
for algo in ALGORITHMS:
# log_file = os.path.join(log_fold, 'bayesmix_{0}_{1}.log'.format(name, algo))
# with open(log_file, 'w') as f:
# with redirect_stdout(f):
out = run_mcmc(g0_name, "PY", dataset, g0_params, PY_PRIOR,
ALGO_SETTINGS.format(algo), dataset,
return_num_clusters=True, # out [1]
return_clusters=False, return_best_clus=False)
log_file = os.path.join(log_fold, 'bayesmix_{0}_{1}.log'.format(name, algo))
with open(log_file, 'w') as f:
with redirect_stdout(f):
out = run_mcmc(g0_name, "PY", dataset, g0_params, PY_PRIOR,
ALGO_SETTINGS.format(algo), dataset,
return_num_clusters=True, # out [1]
return_clusters=False, return_best_clus=False)
out_dens[name][algo] = out[0]
out_clus[name][algo] = out[1]
return out_dens, out_clus
Expand All @@ -184,143 +184,143 @@ def run_bayesmix(log_fold, dataset, name, g0_params, univariate: bool):
for fold in (log_fold, csv_fold, png_fold, build_fold):
os.makedirs(fold, exist_ok=True)

# subprocess.call("cmake .. -DDISABLE_BENCHMARKS=ON".split(), cwd=build_fold)
# subprocess.call("make plot_mcmc -j4".split(), cwd=build_fold)
subprocess.call("cmake .. -DDISABLE_BENCHMARKS=ON".split(), cwd=build_fold)
subprocess.call("make plot_mcmc -j4".split(), cwd=build_fold)

build_bayesmix(4)

###########################
## COMMAND LINE EXAMPLES ##
###########################

# run_cmd = """build/run_mcmc
# --algo-params-file resources/tutorial/algo.asciipb
# --hier-type NNIG --hier-args resources/tutorial/nnig_ngg.asciipb
# --mix-type DP --mix-args resources/tutorial/dp_gamma.asciipb
# --coll-name python/{0}/chains.recordio
# --data-file resources/tutorial/data.csv
# --grid-file resources/tutorial/grid.csv
# --dens-file python/{1}/cmdline_density.csv
# --n-cl-file python/{1}/cmdline_numclust.csv
# --clus-file python/{1}/cmdline_clustering.csv
# --best-clus-file python/{1}/cmdline_best_clustering.csv
# """.format(log_fold, csv_fold)
# subprocess.call(run_cmd.split(), cwd="../")

# plt_cmd = """build/plot_mcmc
# --grid-file resources/tutorial/grid.csv
# --dens-file python/{0}/cmdline_density.csv
# --dens-plot python/{1}/cmdline_density.png
# --n-cl-file python/{0}/cmdline_numclust.csv
# --n-cl-trace-plot python/{1}/cmdline_traceplot.png
# --n-cl-bar-plot python/{1}/cmdline_nclus_barplot.png
# """.format(csv_fold, png_fold)
# subprocess.call(plt_cmd.split(), cwd="../")

# ###############################
# ## PYTHON UNIVARIATE EXAMPLE ##
# ###############################
# data = np.concatenate([
# np.random.normal(loc=3, scale=1, size=100),
# np.random.normal(loc=-3, scale=1, size=100),
# ])

# dp_params = """
# fixed_value {
# totalmass: 1.0
# }
# """

# g0_params = """
# fixed_values {
# mean: 0.0
# var_scaling: 0.1
# shape: 2.0
# scale: 2.0
# }
# """

# algo_params = """
# algo_id: "Neal2"
# rng_seed: 20201124
# iterations: 2000
# burnin: 1000
# init_num_clusters: 3
# """

# dens_grid = np.linspace(-6, 6, 1000)

# log_dens, numcluschain, cluschain, bestclus = run_mcmc(
# "NNIG", "DP", data, g0_params, dp_params, algo_params,
# dens_grid=dens_grid, return_clusters=True, return_num_clusters=True,
# return_best_clus=True)

# fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
# axes[0].hist(data, alpha=0.2, density=True)
# for c in np.unique(bestclus):
# data_in_clus = data[bestclus == c]
# axes[0].scatter(data_in_clus, np.zeros_like(data_in_clus) + 0.01,
# label="Cluster {0}".format(int(c) + 1))
# axes[0].plot(dens_grid, np.exp(np.mean(log_dens, axis=0)), color="red", lw=3)
# axes[0].legend(fontsize=12, ncol=2, loc=1)
# axes[0].set_ylim(0, 0.3)


# x, y = np.unique(numcluschain, return_counts=True)
# axes[1].bar(x, y / y.sum())
# axes[1].set_xticks(x)

# axes[2].vlines(np.arange(len(numcluschain)), numcluschain-0.3, numcluschain+0.3)
# plt.savefig(os.path.join(png_fold, 'bayesmix_example_univariate.png'),
# dpi=300, bbox_inches='tight')


# ##############################
# ## PYTHON BIVARIATE EXAMPLE ##
# ##############################

# g0_params = """
# fixed_values {
# mean {
# size: 2
# data: [3.484, 3.487]
# }
# var_scaling: 0.01
# deg_free: 5
# scale {
# rows: 2
# cols: 2
# data: [1.0, 0.0, 0.0, 1.0]
# rowmajor: false
# }
# }
# """

# data = np.loadtxt('../resources/datasets/faithful.csv', delimiter=',')
# xgrid = np.linspace(0, 6, 50)
# xgrid, ygrid = np.meshgrid(xgrid, xgrid)
# dens_grid = np.hstack([xgrid.reshape(-1, 1), ygrid.reshape(-1, 1)])

# log_dens, numcluschain, _, best_clus_dp = run_mcmc(
# "NNW", "DP", data, g0_params, dp_params, algo_params,
# dens_grid, return_clusters=False, return_num_clusters=True,
# return_best_clus=True)

# fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
# mean_dens_dp = np.mean(log_dens, axis=0)

# axes[0].contour(xgrid, ygrid, mean_dens_dp.reshape(xgrid.shape))
# for c in np.unique(best_clus_dp):
# currdata = data[best_clus_dp == c, :]
# axes[0].scatter(currdata[:, 0], currdata[:, 1])

# x, y = np.unique(numcluschain, return_counts=True)
# axes[1].bar(x, y / y.sum())
# axes[1].set_xticks(x)

# axes[2].vlines(np.arange(len(numcluschain)), numcluschain-0.3, numcluschain+0.3)
# plt.savefig(os.path.join(png_fold, 'bayesmix_example_bivariate.png'),
# dpi=300, bbox_inches='tight')
run_cmd = """build/run_mcmc
--algo-params-file resources/tutorial/algo.asciipb
--hier-type NNIG --hier-args resources/tutorial/nnig_ngg.asciipb
--mix-type DP --mix-args resources/tutorial/dp_gamma.asciipb
--coll-name python/{0}/chains.recordio
--data-file resources/tutorial/data.csv
--grid-file resources/tutorial/grid.csv
--dens-file python/{1}/cmdline_density.csv
--n-cl-file python/{1}/cmdline_numclust.csv
--clus-file python/{1}/cmdline_clustering.csv
--best-clus-file python/{1}/cmdline_best_clustering.csv
""".format(log_fold, csv_fold)
subprocess.call(run_cmd.split(), cwd="../")

plt_cmd = """build/plot_mcmc
--grid-file resources/tutorial/grid.csv
--dens-file python/{0}/cmdline_density.csv
--dens-plot python/{1}/cmdline_density.png
--n-cl-file python/{0}/cmdline_numclust.csv
--n-cl-trace-plot python/{1}/cmdline_traceplot.png
--n-cl-bar-plot python/{1}/cmdline_nclus_barplot.png
""".format(csv_fold, png_fold)
subprocess.call(plt_cmd.split(), cwd="../")

###############################
## PYTHON UNIVARIATE EXAMPLE ##
###############################
data = np.concatenate([
np.random.normal(loc=3, scale=1, size=100),
np.random.normal(loc=-3, scale=1, size=100),
])

dp_params = """
fixed_value {
totalmass: 1.0
}
"""

g0_params = """
fixed_values {
mean: 0.0
var_scaling: 0.1
shape: 2.0
scale: 2.0
}
"""

algo_params = """
algo_id: "Neal2"
rng_seed: 20201124
iterations: 2000
burnin: 1000
init_num_clusters: 3
"""

dens_grid = np.linspace(-6, 6, 1000)

log_dens, numcluschain, cluschain, bestclus = run_mcmc(
"NNIG", "DP", data, g0_params, dp_params, algo_params,
dens_grid=dens_grid, return_clusters=True, return_num_clusters=True,
return_best_clus=True)

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
axes[0].hist(data, alpha=0.2, density=True)
for c in np.unique(bestclus):
data_in_clus = data[bestclus == c]
axes[0].scatter(data_in_clus, np.zeros_like(data_in_clus) + 0.01,
label="Cluster {0}".format(int(c) + 1))
axes[0].plot(dens_grid, np.exp(np.mean(log_dens, axis=0)), color="red", lw=3)
axes[0].legend(fontsize=12, ncol=2, loc=1)
axes[0].set_ylim(0, 0.3)


x, y = np.unique(numcluschain, return_counts=True)
axes[1].bar(x, y / y.sum())
axes[1].set_xticks(x)

axes[2].vlines(np.arange(len(numcluschain)), numcluschain-0.3, numcluschain+0.3)
plt.savefig(os.path.join(png_fold, 'bayesmix_example_univariate.png'),
dpi=300, bbox_inches='tight')


##############################
## PYTHON BIVARIATE EXAMPLE ##
##############################

g0_params = """
fixed_values {
mean {
size: 2
data: [3.484, 3.487]
}
var_scaling: 0.01
deg_free: 5
scale {
rows: 2
cols: 2
data: [1.0, 0.0, 0.0, 1.0]
rowmajor: false
}
}
"""

data = np.loadtxt('../resources/datasets/faithful.csv', delimiter=',')
xgrid = np.linspace(0, 6, 50)
xgrid, ygrid = np.meshgrid(xgrid, xgrid)
dens_grid = np.hstack([xgrid.reshape(-1, 1), ygrid.reshape(-1, 1)])

log_dens, numcluschain, _, best_clus_dp = run_mcmc(
"NNW", "DP", data, g0_params, dp_params, algo_params,
dens_grid, return_clusters=False, return_num_clusters=True,
return_best_clus=True)

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
mean_dens_dp = np.mean(log_dens, axis=0)

axes[0].contour(xgrid, ygrid, mean_dens_dp.reshape(xgrid.shape))
for c in np.unique(best_clus_dp):
currdata = data[best_clus_dp == c, :]
axes[0].scatter(currdata[:, 0], currdata[:, 1])

x, y = np.unique(numcluschain, return_counts=True)
axes[1].bar(x, y / y.sum())
axes[1].set_xticks(x)

axes[2].vlines(np.arange(len(numcluschain)), numcluschain-0.3, numcluschain+0.3)
plt.savefig(os.path.join(png_fold, 'bayesmix_example_bivariate.png'),
dpi=300, bbox_inches='tight')


############################
Expand All @@ -333,12 +333,12 @@ def run_bayesmix(log_fold, dataset, name, g0_params, univariate: bool):
datasets = {}
datasets["galaxy"] = np.loadtxt('../resources/datasets/galaxy.csv', delimiter=',')
datasets["faithful"] = np.loadtxt('../resources/datasets/faithful.csv', delimiter=',')
# datasets["highdim"] = np.loadtxt(highdim_data_file, delimiter=",")
datasets["highdim"] = np.loadtxt(highdim_data_file, delimiter=",")

g0_params = {}
g0_params["galaxy"] = GALAXY_GO
g0_params["faithful"] = FAITHFUL_G0
# g0_params["highdim"] = HIGHDIM_G0
g0_params["highdim"] = HIGHDIM_G0

bayesmix_densities = {}
bayesmix_num_clust = {}
Expand All @@ -353,11 +353,10 @@ def run_bayesmix(log_fold, dataset, name, g0_params, univariate: bool):

## RUN BNPMIX VIA SUBPROCESS
bnpmix_algos = ('MAR', 'ICS')
# subprocess.call("Rscript run_bnpmix.R".split(), cwd="reproducibility")
subprocess.call("Rscript run_bnpmix.R".split(), cwd="reproducibility")

# COMPUTE ESS and TIMES
# datasets = ["galaxy", "faithful", "highdim"]
datasets = ["galaxy", "faithful"]
datasets = ["galaxy", "faithful", "highdim"]

ESS = pd.DataFrame(columns=datasets)
times = pd.DataFrame(columns=datasets)
Expand Down

0 comments on commit 9617049

Please sign in to comment.