diff --git a/Snakefile b/Snakefile index 53a1dc1..24b5c09 100644 --- a/Snakefile +++ b/Snakefile @@ -390,7 +390,7 @@ rule rw_4096_true_cov_true_128: f'src/code/random_walks/truecov_rw.py' for start_diff in [0, 2, 4, 6, 8, 10, 15, 20]: - for n in range(0, 16, 1): + for n in range(0, 6, 1): rule: name: f"llzo_many_{n}_{start_diff}" @@ -415,7 +415,7 @@ for start_diff in [0, 2, 4, 6, 8, 10, 15, 20]: f"true_D_llzo_{start_diff}" input: 'src/code/llzo/true_ls.py', - [f'src/data/llzo/diffusion_{n}_{start_diff}.npz' for n in range(0, 16, 1)] + [f'src/data/llzo/diffusion_{n}_{start_diff}.npz' for n in range(0, 6, 1)] output: f'src/data/llzo/true_{start_diff}.npz' conda: @@ -432,7 +432,7 @@ for start_diff in [0, 2, 4, 6, 8, 10, 15, 20]: f"glswlsols_D_llzo_{start_diff}" input: 'src/code/llzo/glswlsols.py', - [f'src/data/llzo/diffusion_{n}_{start_diff}.npz' for n in range(0, 16, 1)] + [f'src/data/llzo/diffusion_{n}_{start_diff}.npz' for n in range(0, 6, 1)] output: f'src/data/llzo/glswlsols_{start_diff}.npz' conda: diff --git a/environment.yml b/environment.yml index f2bb481..99ebb70 100644 --- a/environment.yml +++ b/environment.yml @@ -7,8 +7,8 @@ dependencies: - python=3.10 - scipy=1.9.3 - scikit-learn=1.1.3 - - mdanalysis=2.3.0 + - mdanalysis=2.6.1 - pip: - - git+https://github.com/matplotlib/matplotlib.git + - matplotlib==3.8.2 - pyblock==0.6 - - git+https://github.com/bjmorgan/kinisi.git + - git+https://github.com/bjmorgan/kinisi.git@with-f \ No newline at end of file diff --git a/showyourwork.yml b/showyourwork.yml index fa644e3..194a876 100644 --- a/showyourwork.yml +++ b/showyourwork.yml @@ -11,25 +11,25 @@ margin_icons: sandbox: "0.25,0.25,0.25" horizontal_offset: -4 -datasets: - 10.5281/zenodo.7602139: - contents: - traj0.xyz: src/data/llzo/traj0.xyz - traj1.xyz: src/data/llzo/traj1.xyz - traj2.xyz: src/data/llzo/traj2.xyz - traj3.xyz: src/data/llzo/traj3.xyz - traj4.xyz: src/data/llzo/traj4.xyz - traj5.xyz: src/data/llzo/traj5.xyz - traj6.xyz: src/data/llzo/traj6.xyz - traj7.xyz: src/data/llzo/traj7.xyz - traj8.xyz: src/data/llzo/traj8.xyz - traj9.xyz: src/data/llzo/traj9.xyz - traj10.xyz: src/data/llzo/traj10.xyz - traj11.xyz: src/data/llzo/traj11.xyz - traj12.xyz: src/data/llzo/traj12.xyz - traj13.xyz: src/data/llzo/traj13.xyz - traj14.xyz: src/data/llzo/traj14.xyz - traj15.xyz: src/data/llzo/traj15.xyz +# datasets: +# 10.5281/zenodo.7602139: +# contents: +# traj0.xyz: src/data/llzo/traj0.xyz +# traj1.xyz: src/data/llzo/traj1.xyz +# traj2.xyz: src/data/llzo/traj2.xyz +# traj3.xyz: src/data/llzo/traj3.xyz +# traj4.xyz: src/data/llzo/traj4.xyz +# traj5.xyz: src/data/llzo/traj5.xyz +# traj6.xyz: src/data/llzo/traj6.xyz +# traj7.xyz: src/data/llzo/traj7.xyz +# traj8.xyz: src/data/llzo/traj8.xyz +# traj9.xyz: src/data/llzo/traj9.xyz +# traj10.xyz: src/data/llzo/traj10.xyz +# traj11.xyz: src/data/llzo/traj11.xyz +# traj12.xyz: src/data/llzo/traj12.xyz +# traj13.xyz: src/data/llzo/traj13.xyz +# traj14.xyz: src/data/llzo/traj14.xyz +# traj15.xyz: src/data/llzo/traj15.xyz dependencies: src/ms.tex: @@ -45,7 +45,7 @@ dependencies: - src/data/random_walks/numerical/D_1_128_128.npz src/scripts/diffusion.py: - src/data/llzo/true_10.npz - {% for n in range(0, 16, 1) %} + {% for n in range(0, 6, 1) %} - src/data/llzo/diffusion_{{n}}_10.npz {% endfor %} - src/scripts/utils/_fig_params.py diff --git a/src/code/llzo/glswlsols.py b/src/code/llzo/glswlsols.py index d5be2b6..9ec4e4d 100644 --- a/src/code/llzo/glswlsols.py +++ b/src/code/llzo/glswlsols.py @@ -15,9 +15,12 @@ start_diff = 
snakemake.params['start_diff']
 
 timestep = np.load(f'src/data/llzo/diffusion_0_{start_diff}.npz')['dt'][0, 0]
-no = np.load(f'src/data/llzo/diffusion_0_{start_diff}.npz')['no'][0, 0]
-d = np.zeros((16, 8, 1, timestep.size))
-for i in range(0, 16, 1):
+length = 2000
+ll = len([i + length for i in range(0, 20001 - length, length)])
+# length = 500
+# ll = len([i + length for i in range(0, 2000, length)])
+d = np.zeros((6, 8, ll, timestep.size))
+for i in range(0, 6, 1):
     d[i] = np.load(f'src/data/llzo/diffusion_{i}_{start_diff}.npz')['msd_true']
 
 max_ngp = np.argwhere(timestep > start_diff)[0][0]
@@ -31,16 +34,16 @@ V = np.diag(true_cov.diagonal())
 
 g1 = np.matmul(np.linalg.inv(np.matmul(A.T, np.matmul(np.linalg.pinv(W), A))),
-               np.matmul(A.T, np.matmul(np.linalg.pinv(W), y)))[0] / 6
-c1 = np.sqrt(np.linalg.inv(np.matmul(A.T, np.matmul(np.linalg.pinv(W), A)))[0][0]) / 6
+               np.matmul(A.T, np.matmul(np.linalg.pinv(W), y)))[0] / 6e4
+c1 = np.sqrt(np.linalg.inv(np.matmul(A.T, np.matmul(np.linalg.pinv(W), A)))[0][0]) / 6e4
 g2 = np.matmul(np.linalg.inv(np.matmul(A.T, np.matmul(np.linalg.pinv(V), A))),
-               np.matmul(A.T, np.matmul(np.linalg.pinv(V), y)))[0] / 6
-c2 = np.sqrt(np.linalg.inv(np.matmul(A.T, np.matmul(np.linalg.pinv(V), A)))[0][0]) / 6
-g3 = np.matmul(np.linalg.inv(np.matmul(A.T, A)), np.matmul(A.T, y))[0] / 6
+               np.matmul(A.T, np.matmul(np.linalg.pinv(V), y)))[0] / 6e4
+c2 = np.sqrt(np.linalg.inv(np.matmul(A.T, np.matmul(np.linalg.pinv(V), A)))[0][0]) / 6e4
+g3 = np.matmul(np.linalg.inv(np.matmul(A.T, A)), np.matmul(A.T, y))[0] / 6e4
 c3 = []
 for i in true_msd:
-    c3.append(linregress(timestep, i).stderr / 6)
+    c3.append(linregress(timestep, i).stderr / 6e4)
 
 np.savez(f'src/data/llzo/glswlsols_{start_diff}.npz',
          gls_pop=g1,
diff --git a/src/code/llzo/many_runs.py b/src/code/llzo/many_runs.py
index 5277eea..6134fa6 100644
--- a/src/code/llzo/many_runs.py
+++ b/src/code/llzo/many_runs.py
@@ -2,97 +2,83 @@
 import numpy as np
 from kinisi.parser import MDAnalysisParser
 from kinisi.diffusion import MSDBootstrap
-from tqdm import tqdm
 
 n = snakemake.params['n']
 start_diff = snakemake.params['start_diff']
 
-def universe_slice(u: mda.Universe, data_file: str, slice_start: int, slice_end: int) -> mda.Universe:
+step = 8
+
+def universe_slice(data: np.ndarray, slice_start: int, slice_end: int, m: int) -> mda.Universe:
     """
-    Concatenate a list of Universes.
+    Create a sliced, subsampled Universe from a raw coordinate array.
 
-    :param u_list: List of Universe objects
-    :param data_file: File path for data
+    :param data: Numpy data of coordinates
     :param slice_start: Step to start the slice from
+    :param slice_end: Step to end the slice from
+    :param m: Offset of the atom stride (0 to step - 1)
     :return: A single flattened Universe
     """
-    j = 0
-    trajectory = np.zeros((len(u.trajectory[slice_start:slice_end]), len(u.atoms), 3))
-    for t in tqdm(u.trajectory[slice_start:slice_end]):
-        trajectory[j] = t.positions
-        j += 1
-    shorter_universe = mda.Universe(data_file, trajectory)
-    for t in tqdm(shorter_universe.trajectory):
-        for i in shorter_universe.atoms:
-            if i.type == 'LI':
-                i.type = 'Li'
-            if i.type == 'Li1':
-                i.type = 'Li'
-            if i.type == 'Li2':
-                i.type = 'Li'
-            if i.type == 'L':
-                i.type = 'La'
-            if i.type == 'Z':
-                i.type = 'Zr'
-        t.dimensions = np.array([26.175106662244392197, 26.175106662244392197, 26.175106662244392197, 90, 90, 90])
-    return shorter_universe
-
-file = f'src/data/llzo/traj{n}.xyz'
+    data_reshape = np.copy(data).reshape(-1, 1536, 3)
+    data_subset = data_reshape[slice_start:slice_end, m::step]
+    data_subset *= 0.52917721067  # convert Bohr to angstrom
+    u = mda.Universe.empty(data_subset.shape[1], 1, n_frames=data_subset.shape[0], trajectory=True)
+    list_type = ['A'] * (1536-448) + ['Li'] * 448
+    u.add_TopologyAttr('type', list_type[m::step])
+    for i, t in enumerate(u.trajectory):
+        t.positions = data_subset[i]
+        t.dimensions = np.array([26.30938114, 26.30938114, 26.30938114, 90, 90, 90])
+    return u
 
-
-u = mda.Universe(file, file)
-uu = universe_slice(u, file, 0, 2000)
+data = np.loadtxt(f'src/data/llzo/traj{n}.out', usecols=[0, 1, 2])
+length = 2000
+uu = universe_slice(data, 0, length, 0)
 
 step_skip = 100  # sampling rate
 timestep = 5.079648e-4  # ps
 da_params = {'specie': 'Li', 'time_step': timestep, 'step_skip': step_skip}
 p = MDAnalysisParser(uu, **da_params)
-ngp = np.zeros((8, 1, p.delta_t.size))
-no = np.zeros((8, 1, p.delta_t.size))
-dt = np.zeros((8, 1, p.delta_t.size))
-msd = np.zeros((8, 1, p.delta_t.size))
-msd_true = np.zeros((8, 1, p.delta_t.size))
-msd_std = np.zeros((8, 1, p.delta_t.size))
-cov = np.zeros((8, 1, p.delta_t[np.where(p.delta_t > start_diff)].size, p.delta_t[np.where(p.delta_t > start_diff)].size))
-d = np.zeros((8, 1, 3200))
-intercept = np.zeros((8, 1, 3200))
+ll = len([i + length for i in range(0, 20001 - length, length)])
+dt = np.zeros((step, ll, p.delta_t.size))
+msd = np.zeros((step, ll, p.delta_t.size))
+msd_true = np.zeros((step, ll, p.delta_t.size))
+msd_std = np.zeros((step, ll, p.delta_t.size))
+n_o = np.zeros((step, ll, p.delta_t.size))
+cov = np.zeros((step, ll, p.delta_t[np.where(p.delta_t > start_diff)].size, p.delta_t[np.where(p.delta_t > start_diff)].size))
+d = np.zeros((step, ll, 3200))
+g = np.zeros((step, ll, 3200))
+f = np.zeros((step, ll, p.delta_t.size))
+intercept = np.zeros((step, ll, 3200))
 
-for m in range(0, 8, 1):
-    for i, slice in enumerate(range(0, 2000, 2000)):
-        uu = universe_slice(u, file, slice, slice+2000)
+for m in range(0, step, 1):
+    for i, slice in enumerate(range(0, 20001 - length, length)):
+        uu = universe_slice(data, slice, slice+length, m)
         rng = np.random.RandomState(42)
         np.random.seed(42)
         da_params = {'specie': 'Li', 'time_step': timestep, 'step_skip': step_skip}
         p = MDAnalysisParser(uu, **da_params)
-        n_disp_3d = []
-        for t, j in enumerate(p.disp_3d):
-            if m == 0:
-                n_disp_3d.append(j[::8])
-                msd_true[m, i, t] = np.sum(j[::8, ::p.timesteps[t]]**2, axis=-1).mean()
-            else:
-                n_disp_3d.append(j[m::8])
-                msd_true[m, i, t] = np.sum(j[m::8, ::p.timesteps[t]]**2, axis=-1).mean()
-
-        b = MSDBootstrap(p.delta_t, n_disp_3d, p._n_o / 8)
+        b = MSDBootstrap(p.delta_t, p.disp_3d, p._n_o)
         b.diffusion(start_diff, random_state=rng)
-        ngp[m, i] = b.ngp
-        no[m, i] = p._n_o / 8
         dt[m, i] = b.dt
         msd[m, i] = b.n
+        msd_true[m, i] = b.n
         msd_std[m, i] = b.s
+        n_o[m, i] = p._n_o
         cov[m, i] = b.covariance_matrix
         d[m, i] = b.D.samples
+        g[m, i] = b.gradient.samples
+        f[m, i] = b._f
         intercept[m, i] = b.intercept.samples
 
 np.savez(f'src/data/llzo/diffusion_{n}_{start_diff}.npz',
-         ngp=ngp,
-         no=no,
          dt=dt,
          msd_true=msd_true,
         
         msd=msd,
         msd_std=msd_std,
+         n_o=n_o,
         cov=cov,
         d=d,
+         g=g,
+         f=f,
         intercept=intercept)
\ No newline at end of file
diff --git a/src/code/llzo/true_ls.py b/src/code/llzo/true_ls.py
index c119217..c45a511 100644
--- a/src/code/llzo/true_ls.py
+++ b/src/code/llzo/true_ls.py
@@ -14,9 +14,12 @@ start_diff = snakemake.params['start_diff']
 
 timestep = np.load(f'src/data/llzo/diffusion_0_{start_diff}.npz')['dt'][0, 0]
-no = np.load(f'src/data/llzo/diffusion_0_{start_diff}.npz')['no'][0, 0]
-d = np.zeros((16, 8, 1, timestep.size))
-for i in range(0, 16, 1):
+length = 2000
+ll = len([i + length for i in range(0, 20001 - length, length)])
+# length = 500
+# ll = len([i + length for i in range(0, 2000, length)])
+d = np.zeros((6, 8, ll, timestep.size))
+for i in range(0, 6, 1):
     d[i] = np.load(f'src/data/llzo/diffusion_{i}_{start_diff}.npz')['msd_true']
 
 true_mean = d.reshape(-1, d.shape[-1]).mean(0)
diff --git a/src/scripts/diffusion.py b/src/scripts/diffusion.py
index b5ca7fc..5e9b635 100644
--- a/src/scripts/diffusion.py
+++ b/src/scripts/diffusion.py
@@ -13,8 +13,12 @@ type = 'kinisi'
 
 dllzo_true = np.load(paths.data / "llzo/true_10.npz")['diff_c']
-d = np.zeros((16, 8, 1, 3200))
-for i in range(0, 16, 1):
+length = 2000
+ll = len([i + length for i in range(0, 20001 - length, length)])
+# length = 500
+# ll = len([i + length for i in range(0, 2000, length)])
+d = np.zeros((6, 8, ll, 3200))
+for i in range(0, 6, 1):
     d[i] = np.load(paths.data / f'llzo/diffusion_{i}_10.npz')['d']
 d = d.reshape(-1, 3200)
 llzo_x = np.linspace(dllzo_true.min(), dllzo_true.max(), 1000)
diff --git a/src/scripts/random_walk.py b/src/scripts/random_walk.py
index 386f208..7d07b40 100644
--- a/src/scripts/random_walk.py
+++ b/src/scripts/random_walk.py
@@ -75,11 +75,11 @@
              norm.pdf(rw_x, dinfty_true.mean(), dinfty_true.std()),
              color='#B8B8B8',
              label='$p(\hat{D}^*_{\mathrm{num}})$')
-axes[-1].plot([true.mean(-1).mean() - true.mean(-1).std(),
-               true.mean(-1).mean() + true.mean(-1).std()],
+axes[-1].plot([dinfty_true.mean() - dinfty_true.std(),
+               dinfty_true.mean() + dinfty_true.std()],
              [axes[-1].get_ylim()[1] * 1.1, axes[-1].get_ylim()[1] * 1.1],
              color='#F3ADBC',
-             label='$\sigma[\hat{D}^*]$',
+             label='$\sigma[\hat{D}^*_{\mathrm{num}}]$',
              marker='|')
 axes[-1].set_yticks([0, 10, 20])
 axes[-1].set_xlabel(r"$\hat{D}^*$")
@@ -93,14 +93,14 @@
 axes.append(fig.add_subplot(gs[0, 3]))
 y, x = np.histogram(true.var(-1, ddof=1), bins=fp.NBINS, density=True)
 axes[-1].stairs(y, x, fill=True, color='#B3CBE8', label='$p(\hat{\sigma}^2[D^*])$')
-axes[-1].axvline(true.mean(-1).var(ddof=1),
+axes[-1].axvline(dinfty_true.var(ddof=1),
                  ymax=0.85,
                  c='#F3ADBC',
-                 label='$\sigma^2[\hat{D}^*]$',
+                 label=r'$\sigma^2[\hat{D}^*_{\mathrm{num}}]$',
                  ls='-')
 axes[-1].set_xlabel(r"$\sigma^2 [\hat{D}^*]$")
 axes[-1].set_ylabel(r"$p(\sigma^2 [\hat{D}^*])$")
-axes[-1].set_xlim([0, None])
+axes[-1].set_xlim([0, 3.5e-3])
 titles.append("d")
 axes[-1].set_title("all simulations")
 axes[-1].legend(loc='upper left', bbox_to_anchor=(0.5, 1))
diff --git a/src/tex/ms.tex b/src/tex/ms.tex
index 1228844..acb893a 100644
--- a/src/tex/ms.tex
+++ b/src/tex/ms.tex
@@ -174,17 +174,17 @@ \subsection{Background}
 These variances are also typically unequal---the data are heteroscedastic \cite{smith_random_1996,he_statistical_2018,UslerEtAl_JComputChem2023}.
 Because the key assumptions of the OLS method are not valid for MSD data, OLS gives statistically inefficient estimates of $\D$, while the estimated regression uncertainties obtained from the standard OLS statistical formulae significantly underestimate the true uncertainty in $\prob{\Dest_\mathrm{OLS}}$ (Fig.~\ref{fig:glswlsols}a).
 
-\begin{figure}
-    \centering
-    % \resizebox{\columnwidth}{!}{\includegraphics*{figure1.pdf}}
-    \includegraphics[width=\columnwidth]{glswlsols.pdf}
-    \script{glswlsols.py}
-    \caption{
-        Example distributions of estimated self-diffusion coefficients, $\Dest$, calculated using (a) ordinary least squares (OLS), (b) weighted least squares (WLS), and (c) generalised least squares (GLS),
-        from MSD data from \num{4096} individual simulations of \num{128} particles undergoing a \SI{128}{step} 3D lattice random walk, with a step size chosen so that the true diffusion coefficient $\D = 1$.
-        In each panel, the grey curve shows the best-fit normal distribution for the simulation data, the upper horizontal bar shows the standard deviation of this distribution, and the lower horizontal bar shows the average estimated standard distribution given by the analytical expression for $\sigma[\prob{\Dest}]$ for each regression method.}
-    \label{fig:glswlsols}
-\end{figure}
+% \begin{figure}
+%     \centering
+%     % \resizebox{\columnwidth}{!}{\includegraphics*{figure1.pdf}}
+%     \includegraphics[width=\columnwidth]{glswlsols.pdf}
+%     \script{glswlsols.py}
+%     \caption{
+%         Example distributions of estimated self-diffusion coefficients, $\Dest$, calculated using (a) ordinary least squares (OLS), (b) weighted least squares (WLS), and (c) generalised least squares (GLS),
+%         from MSD data from \num{4096} individual simulations of \num{128} particles undergoing a \SI{128}{step} 3D lattice random walk, with a step size chosen so that the true diffusion coefficient $\D = 1$.
+%         In each panel, the grey curve shows the best-fit normal distribution for the simulation data, the upper horizontal bar shows the standard deviation of this distribution, and the lower horizontal bar shows the average estimated standard deviation given by the analytical expression for $\sigma[\prob{\Dest}]$ for each regression method.}
+%     \label{fig:glswlsols}
+% \end{figure}
 
 Some improvement can be made by using weighted least squares (WLS) (Fig.~\ref{fig:glswlsols}b), where the residual for each observed MSD value is weighted by the reciprocal of its variance, $1/(\var{\oMSDi})$.
 Like OLS, WLS is an unbiased estimator, and for heteroscedastic data it has higher statistical efficiency than OLS.
@@ -268,20 +268,20 @@ \subsection{Approximating $\bm{\Sigma}$ from simulation data}
 We also compute the MSD and estimated variance using data from a single simulation trajectory (Fig.~\ref{fig:msd}b), using the scheme described above.
 A quantitative comparison between the true MSD variance and the single-trajectory estimated MSD variance is made in Fig.~\ref{fig:msd}c: the close numerical agreement confirms that Eqn.~\ref{equ:varestMSD} can be used to estimate $\var{\oMSD}$, which can then be used to parametrise the model covariance matrix $\mathbf{\Sigma^\prime}$ via Eqn.~\ref{equ:cvv}.
 %
-\begin{figure}
-    \centering
-    % \resizebox{\columnwidth}{!}{\includegraphics*{figure2.pdf}}
-    \includegraphics[width=\columnwidth]{msd.pdf}
-    \script{msd.py}
-    \caption{
-        Comparison of the numerical variance in observed MSD from multiple replica simulations and the estimated variance in observed MSD given by rescaling the variance in observed squared displacements (Eqn.~\ref{equ:varestMSD}).
-        Panel (a) shows the mean observed MSD from \num{4096} simulations of \num{128} particles undergoing a 3D lattice random walk of \num{128} steps per particle, with error bars of $\pm2\sigma[\oMSDi]$.
-        Panel (b) shows the MSD from just one simulation, with error bars of $\pm2\widehat{\sigma}[\oMSDi]$, obtained via Eqn.~\ref{equ:varestMSD}.
-        Panel (c) plots the numerical variance against the estimated variance from a single simulation as a function of timestep $i$.
-        %Raw simulation data~\cite{mccluskey_zenodo_2022} and scripts~\cite{mccluskey_github_2022} to generate this figure are available under CC BY 4.0/MIT licences.
-    }
-    \label{fig:msd}
-\end{figure}
+% \begin{figure}
+%     \centering
+%     % \resizebox{\columnwidth}{!}{\includegraphics*{figure2.pdf}}
+%     \includegraphics[width=\columnwidth]{msd.pdf}
+%     \script{msd.py}
+%     \caption{
+%         Comparison of the numerical variance in observed MSD from multiple replica simulations and the estimated variance in observed MSD given by rescaling the variance in observed squared displacements (Eqn.~\ref{equ:varestMSD}).
+%         Panel (a) shows the mean observed MSD from \num{4096} simulations of \num{128} particles undergoing a 3D lattice random walk of \num{128} steps per particle, with error bars of $\pm2\sigma[\oMSDi]$.
+%         Panel (b) shows the MSD from just one simulation, with error bars of $\pm2\widehat{\sigma}[\oMSDi]$, obtained via Eqn.~\ref{equ:varestMSD}.
+%         Panel (c) plots the numerical variance against the estimated variance from a single simulation as a function of timestep $i$.
+%         %Raw simulation data~\cite{mccluskey_zenodo_2022} and scripts~\cite{mccluskey_github_2022} to generate this figure are available under CC BY 4.0/MIT licences.
+%     }
+%     \label{fig:msd}
+% \end{figure}
 %
 
 The practical implementation of both GLS and Bayesian regression requires that the covariance matrix $\mathbf{\Sigma^\prime}$ is invertible (positive definite); see Eqns.~\ref{equ:gls} and \ref{equ:loglike}.
@@ -300,21 +300,21 @@ \subsection{Approximating $\bm{\Sigma}$ from simulation data}
 the numerically converged covariance matrix, $\bm{\Sigma}$, computed using the data from all \num{4096} simulations (Fig.~\ref{fig:covariances}a);
 the corresponding analytical model covariance matrix, $\mathbf{\Sigma^\prime}$, defined by Eqn.~\ref{equ:cvv} and parametrised using analytical variances $\var{\oMSDi}$ (Fig.~\ref{fig:covariances}b);
 and the average model covariance matrix obtained by parametrising Eqn.~\ref{equ:cvv} using smoothed variances estimated from individual simulation trajectories, and averaging over the resulting set of all \num{4096} matrices (Fig.~\ref{fig:covariances}c).
 %
-\begin{figure}
-    \centering
-    % \resizebox{\columnwidth}{!}{\includegraphics*{figure3.pdf}}
-    \includegraphics[width=\columnwidth]{covariances.pdf}
-    \script{covariances.py}
-    \caption{
-        (a) The numerical MSD covariance matrix $\bm{\Sigma}$ calculated using MSD data from \num{4096} simulations of \num{128} particles undergoing a 3D lattice random walk of \num{128} steps per particle.
-        (b) The analytical MSD covariance matrix $\bm{\Sigma^\prime}$ (Eqn.~\ref{equ:cvv}), parametrised using analytical random-walk variances $\var{\oMSDi}$.
-        (c) The MSD covariance matrix obtained applying the numerical scheme described in the main text to each individual random walk simulation, averaged over all \num{4096} such simulations.
-        Colour bars in (a--c) show the covariance, $\Sigma\left[\oMSDi, \oMSDj\right]$.
-        The off-diagonal panels show difference plots, computed as per-element ratios between pairs of covariance matrices (a--c).
-        %Raw simulation data~\cite{mccluskey_zenodo_2022} and scripts~\cite{mccluskey_github_2022} to generate this figure are available under CC BY 4.0/MIT licences.
-    }
-    \label{fig:covariances}
-\end{figure}
+% \begin{figure}
+%     \centering
+%     % \resizebox{\columnwidth}{!}{\includegraphics*{figure3.pdf}}
+%     \includegraphics[width=\columnwidth]{covariances.pdf}
+%     \script{covariances.py}
+%     \caption{
+%         (a) The numerical MSD covariance matrix $\bm{\Sigma}$ calculated using MSD data from \num{4096} simulations of \num{128} particles undergoing a 3D lattice random walk of \num{128} steps per particle.
+%         (b) The analytical MSD covariance matrix $\bm{\Sigma^\prime}$ (Eqn.~\ref{equ:cvv}), parametrised using analytical random-walk variances $\var{\oMSDi}$.
+%         (c) The MSD covariance matrix obtained by applying the numerical scheme described in the main text to each individual random walk simulation, averaged over all \num{4096} such simulations.
+%         Colour bars in (a--c) show the covariance, $\Sigma\left[\oMSDi, \oMSDj\right]$.
+%         The off-diagonal panels show difference plots, computed as per-element ratios between pairs of covariance matrices (a--c).
+%         %Raw simulation data~\cite{mccluskey_zenodo_2022} and scripts~\cite{mccluskey_github_2022} to generate this figure are available under CC BY 4.0/MIT licences.
+%     }
+%     \label{fig:covariances}
+% \end{figure}
 %
 
 While the analytical and average estimated covariance matrices show some systematic deviation from the numerically converged covariance matrix, the general correlation structure is preserved.
@@ -323,26 +323,26 @@ \subsection{Approximating $\bm{\Sigma}$ from simulation data}
 
 \subsection{Validation}
 
-\begin{figure*}[ht!]
-    \centering
-    % \resizebox{\textwidth}{!}{\includegraphics*{figure4.pdf}}
-    \includegraphics[width=\textwidth]{random_walk.pdf}
-    \script{random_walk.py}
-    \caption{
-        (a) Observed MSD from a single simulation of 128 particles undergoing a 3D-lattice random walk of 128 steps per particle (dark line).
-        The green shading shows the corresponding posterior distribution $\prob{\model|\oMSD}$ of linear models compatible with the observed MSD data $\oMSD$, calculated using the scheme described in the main text.
-        The variegated shading indicates compatibility intervals of \SIlist[list-final-separator = {, and }]{1;2;3}{\sigma}$[\prob{\model|\oMSD}]$.
-        (b) The marginal posterior distribution $\prob{\Dest|\oMSD}$ obtained from the posterior distribution of linear models in (a).
-        The mean of this distribution gives the point estimate $\Dest$ for this simulation input data.
- The blue horizontal bar shows an interval of one standard deviation in $\prob{\Dest|\oMSD}$. - (c) Probability distribution of point-estimates $\prob{\Dest}$ obtained from \num{4096} individual random-walk simulations. - Each simulation has been analysed as in (a) and (b) to yield a single corresponding point estimate $\Dest$. - The grey line shows the distribution of point estimates, $\prob{\Dest_\mathrm{num}}$, obtained using Bayesian regression with a mean vector and numerical covariance matrix derived from the complete dataset of all \num{4096} simulations. - The pink horizontal bar shows an interval of one standard deviation in $\prob{\Dest}$. - (d) Probability distribution of estimated variances, $\varest{\Dest}$, for individual random-walk simulations, compared to the true sample variance (pink vertical line) $\var{\Dest}$. - } - \label{fig:random_walk} -\end{figure*} +% \begin{figure*}[ht!] +% \centering +% % \resizebox{\textwidth}{!}{\includegraphics*{figure4.pdf}} +% \includegraphics[width=\textwidth]{random_walk.pdf} +% \script{random_walk.py} +% \caption{ +% (a) Observed MSD from a single simulation of 128 particles undergoing a 3D-lattice random walk of 128 steps per particle (dark line). +% The green shading shows the corresponding posterior distribution $\prob{\model|\oMSD}$ of linear models compatible with the observed MSD data $\oMSD$, calculated using the scheme described in the main text. +% The variegated shading indicates compatibility intervals of \SIlist[list-final-separator = {, and }]{1;2;3}{\sigma}$[\prob{\model|\oMSD}]$. +% (b) The marginal posterior distribution $\prob{\Dest|\oMSD}$ obtained from the posterior distribution of linear models in (a). +% The mean of this distribution gives the point estimate $\Dest$ for this simulation input data. +% The blue horizontal bar shows an interval of one standard deviation in $\prob{\Dest|\oMSD}$. +% (c) Probability distribution of point-estimates $\prob{\Dest}$ obtained from \num{4096} individual random-walk simulations. +% Each simulation has been analysed as in (a) and (b) to yield a single corresponding point estimate $\Dest$. +% The grey line shows the distribution of point estimates, $\prob{\Dest_\mathrm{num}}$, obtained using Bayesian regression with a mean vector and numerical covariance matrix derived from the complete dataset of all \num{4096} simulations. +% The pink horizontal bar shows an interval of one standard deviation in $\prob{\Dest}$. +% (d) Probability distribution of estimated variances, $\varest{\Dest}$, for individual random-walk simulations, compared to the true sample variance (pink vertical line) $\var{\Dest}$. +% } +% \label{fig:random_walk} +% \end{figure*} To demonstrate the complete approximate Bayesian regression scheme, as described above, we present two distinct examples. First, we consider a simple 3D-lattice random walk, where the true self-diffusion coefficient $\D$ is specified by the simulation parameters, and a well-converged numerical covariance matrix can be obtained with relatively low computational cost, which allows us to directly compare the estimates produced by our method to ``best case'' estimates from a hypothetical method with access to the true covariance matrix. 
@@ -397,20 +397,20 @@ \subsection{$\var{\Dest}$ scaling and comparison to OLS, WLS, and GLS}
 
-\begin{figure}[htb]
-    \centering
-    % \resizebox{\columnwidth}{!}{\includegraphics*{figure6.pdf}}
-    \includegraphics[width=\columnwidth]{stat_eff.pdf}
-    \script{stat_eff.py}
-    \caption{
-        Scaling of $\var{\Dest}$ with simulation size for OLS (pink), WLS (blue), our approximate Bayesian regression method (green), and GLS (orange).
-        (a) Scaling versus number of mobile particles, $N_\mathrm{atoms}$.
-        (b) Scaling versus total simulation time, $t_\mathrm{max}$.
-        Solid lines show fitted power law relationships for each dataset.
-        The WLS and GLS data are obtained using numerically determined variances and covariance, respectively, from a set of \num{512} repeat simulations for each combination of $N_\mathrm{atoms}$ and $t_\mathrm{max}$.
-    }
-    \label{fig:stat_eff}
-\end{figure}
+% \begin{figure}[htb]
+%     \centering
+%     % \resizebox{\columnwidth}{!}{\includegraphics*{figure6.pdf}}
+%     \includegraphics[width=\columnwidth]{stat_eff.pdf}
+%     \script{stat_eff.py}
+%     \caption{
+%         Scaling of $\var{\Dest}$ with simulation size for OLS (pink), WLS (blue), our approximate Bayesian regression method (green), and GLS (orange).
+%         (a) Scaling versus number of mobile particles, $N_\mathrm{atoms}$.
+%         (b) Scaling versus total simulation time, $t_\mathrm{max}$.
+%         Solid lines show fitted power law relationships for each dataset.
+%         The WLS and GLS data are obtained using numerically determined variances and covariance, respectively, from a set of \num{512} repeat simulations for each combination of $N_\mathrm{atoms}$ and $t_\mathrm{max}$.
+%     }
+%     \label{fig:stat_eff}
+% \end{figure}
 
 Fig.~\ref{fig:stat_eff} presents an analysis of the variation in $\var{\Dest}$ as the number of mobile particles (Fig.~\ref{fig:stat_eff}a) and the total simulation time (number of steps) (Fig.~\ref{fig:stat_eff}b) are changed.
 We compare four methods for estimating $\D$ from the observed MSD data: OLS, WLS, the approximate Bayesian regression method described here, and GLS.
@@ -772,14 +772,14 @@ \section{Comparison With Block Averaging Approach}
 When the fitting is not performed, the resulting covariance matrix is numerically unstable, leading to extreme estimated values of $\D$.
 When the fitting is performed and the blocking-estimated variances are used, there is no improvement in the estimation of $\D$, and the resulting distribution of estimated variances of $\prob{\D}$ is broader than when Eqn.~\ref{equ:varestMSD} is used.
 
-\begin{figure}
-    \centering
-    % \resizebox{\columnwidth}{!}{\includegraphics*{figure1.pdf}}
-    \includegraphics[width=\columnwidth]{pyblock.pdf}
-    \script{pyblock.py}
-    \caption{Comparison of the distribution of $\Dest$ (a, c, \& e) and the estimated variances (b, d, \& f) from 4096 individual random walk simulations using variances approximated by Eqn.~\ref{equ:varestMSD} (a \& b), from block averaging with fitting to Eqn.~\ref{equ:variance} (c \& d), and without fitting (e \& f). Note that the bounds in a, c, \& e are defined by the extrema of the estimated values, showing the fact that without the fitting to Eqn.~\ref{equ:variance}, the resulting covariance matrix is numerically unstable.}
-    \label{fig:pyblock}
-\end{figure}
+% \begin{figure}
+%     \centering
+%     % \resizebox{\columnwidth}{!}{\includegraphics*{figure1.pdf}}
+%     \includegraphics[width=\columnwidth]{pyblock.pdf}
+%     \script{pyblock.py}
+%     \caption{Comparison of the distribution of $\Dest$ (a, c, \& e) and the estimated variances (b, d, \& f) from 4096 individual random walk simulations using variances approximated by Eqn.~\ref{equ:varestMSD} (a \& b), from block averaging with fitting to Eqn.~\ref{equ:variance} (c \& d), and without fitting (e \& f). Note that the bounds in a, c, \& e are defined by the extrema of the estimated values, showing that without the fitting to Eqn.~\ref{equ:variance}, the resulting covariance matrix is numerically unstable.}
+%     \label{fig:pyblock}
+% \end{figure}
 
 \section{Bias and skew in $\prob{\varest{\Dest}}$}
 \label{sec:skew}
@@ -793,20 +793,20 @@ \section{Bias and skew in $\prob{\varest{\Dest}}$}
 Fig.~\ref{fig:true_cov} presents equivalent results for $\prob{\Dest}$ and $\prob{\varest{\Dest}}$ for the same \num{4096} individual simulations, but calculated using a numerical covariance matrix, $\bm{\Sigma}_\mathrm{num}$, derived from all \num{4096} observed MSDs.
 The resulting distribution $\prob{\varest{\Dest}}$ (Fig.~\ref{fig:true_cov}b) is no longer biased or skewed.
 Furthermore, the distribution $\prob{\Dest}$ agrees even more closely with the numerically converged distribution obtained when combining data from all \num{4096} simulations (Fig.~\ref{fig:true_cov}a), contrasting with the results presented in Fig.~\ref{fig:random_walk}b, where our approximate Bayesian regression scheme yields a slightly broadened distribution due to the use of the long-time limit in the derivation of the analytical form for $\bm{\Sigma^\prime}$.
 
-\begin{figure}
-    \centering
-    % \resizebox{\columnwidth}{!}{\includegraphics*{figure7.pdf}}
-    \includegraphics[width=\columnwidth]{true_cov.pdf}
-    \script{true_cov.py}
-    \caption{
-        (a) Probability distribution of point-estimates $\prob{\Dest}$ obtained from \num{4096} individual random-walk simulations, using the numerical covariance matrix $\bm{\Sigma}_\mathrm{num}$.
-        Each simulation has been analysed as in Fig.~~\ref{equ:observed_msd}(a) and (b) to yield a single corresponding point estimate $\Dest$.
-        The grey line shows the distribution of point estimates, $\prob{\Dest_\mathrm{num}}$, obtained using Bayesian regression with a mean vector and numerical covariance matrix derived from the complete dataset of all \num{4096} simulations.
-        The pink horizontal bar shows an interval of one standard deviation in $\prob{\Dest}$.
-        (b) Probability distribution of estimated variances, $\varest{\Dest}$, for individual random-walk simulations, using the numerical covariance matrix $\bm{\Sigma}_\mathrm{num}$, compared to the true sample variance (pink vertical line) $\var{\Dest}$.
-    }
-    \label{fig:true_cov}
-\end{figure}
+% \begin{figure}
+%     \centering
+%     % \resizebox{\columnwidth}{!}{\includegraphics*{figure7.pdf}}
+%     \includegraphics[width=\columnwidth]{true_cov.pdf}
+%     \script{true_cov.py}
+%     \caption{
+%         (a) Probability distribution of point-estimates $\prob{\Dest}$ obtained from \num{4096} individual random-walk simulations, using the numerical covariance matrix $\bm{\Sigma}_\mathrm{num}$.
+%         Each simulation has been analysed as in Fig.~\ref{fig:random_walk}(a) and (b) to yield a single corresponding point estimate $\Dest$.
+%         The grey line shows the distribution of point estimates, $\prob{\Dest_\mathrm{num}}$, obtained using Bayesian regression with a mean vector and numerical covariance matrix derived from the complete dataset of all \num{4096} simulations.
+%         The pink horizontal bar shows an interval of one standard deviation in $\prob{\Dest}$.
+%         (b) Probability distribution of estimated variances, $\varest{\Dest}$, for individual random-walk simulations, using the numerical covariance matrix $\bm{\Sigma}_\mathrm{num}$, compared to the true sample variance (pink vertical line) $\var{\Dest}$.
+%     }
+%     \label{fig:true_cov}
+% \end{figure}
 
 \section{Evaluation of OLS, WLS, and GLS for LLZO System}
 
diff --git a/zenodo.yml b/zenodo.yml
index 9f52acb..d8b7033 100644
--- a/zenodo.yml
+++ b/zenodo.yml
@@ -1,3 +1,4 @@
 cache:
   main:
-  sandbox: 10.5072/zenodo.2046
\ No newline at end of file
+  sandbox: 10.5072/zenodo.21503
+
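Two of the changes above benefit from a worked restatement.

First, the `/6` to `/6e4` edits in `src/code/llzo/glswlsols.py` touch every estimator in that hunk, and the chained `np.matmul` calls are hard to read in diff form. The sketch below restates what they compute — generalised, weighted, and ordinary least squares — using `np.linalg.solve` for readability; the script itself uses `np.linalg.pinv`, presumably to guard against a near-singular covariance matrix. Everything in the sketch is an illustrative assumption rather than repository code: `A` is taken to be a `[t, 1]` design matrix over the lag times and `y` an observed MSD vector in Å², in which case dividing the fitted slope by `6e4` would fold the Einstein factor of 6 together with the Å² ps⁻¹ to cm² s⁻¹ conversion (1 Å²/ps = 1e-4 cm²/s).

```python
import numpy as np

rng = np.random.default_rng(0)

# Synthetic stand-ins, for illustration only: 50 lag times, MSD = 6 D t
# with D = 1 A^2/ps, plus correlated noise drawn from a toy covariance.
t = np.linspace(1.0, 50.0, 50)                     # lag times / ps
A = np.column_stack([t, np.ones_like(t)])          # design matrix [t, 1]
W = np.diag(0.01 * t**2) + 0.005 * np.outer(t, t)  # toy MSD covariance (PSD)
y = 6.0 * t + rng.multivariate_normal(np.zeros_like(t), W)

def gls_fit(A, cov, y):
    """Generalised least squares: beta = (A^T C^-1 A)^-1 A^T C^-1 y."""
    ci_a = np.linalg.solve(cov, A)                 # C^-1 A
    ci_y = np.linalg.solve(cov, y)                 # C^-1 y
    normal = A.T @ ci_a                            # A^T C^-1 A
    beta = np.linalg.solve(normal, A.T @ ci_y)
    return beta, np.linalg.inv(normal)             # estimates and their covariance

# GLS weights by the full covariance matrix (g1/c1 in the script), WLS by
# its diagonal only (g2/c2), and OLS by the identity (g3).
for label, cov in [('GLS', W),
                   ('WLS', np.diag(W.diagonal())),
                   ('OLS', np.eye(t.size))]:
    beta, var = gls_fit(A, cov, y)
    d_est = beta[0] / 6e4                          # slope/6, then A^2/ps -> cm^2/s
    d_err = np.sqrt(var[0, 0]) / 6e4
    print(f'{label}: D = {d_est:.3e} +/- {d_err:.3e} cm^2 s^-1')
```

Second, the new partitioning in `src/code/llzo/many_runs.py` replaces the previous layout of 16 trajectories with a single window each by 6 trajectories, each split into `step = 8` interleaved atom subsets and `ll` non-overlapping time windows of `length = 2000` frames. Assuming the `traj{n}.out` files hold 20001 frames — an assumption read off the `20001 - length` bound in the code — the window arithmetic works out as follows:

```python
# Window bookkeeping mirroring many_runs.py; the 20001-frame trajectory
# length is an assumption taken from the range() bound in the diff.
n_frames, length, step = 20001, 2000, 8

# ll exactly as the scripts compute it (the "+ length" shifts the values,
# not the count):
ll = len([i + length for i in range(0, n_frames - length, length)])

# The windows actually sliced in the double loop:
starts = list(range(0, n_frames - length, length))  # [0, 2000, ..., 18000]

print(ll)              # -> 10 windows per trajectory
print(step * ll)       # -> 80 MSD samples per trajectory (was 8 x 1 = 8)
```

Each trajectory therefore contributes 8 × 10 = 80 MSD samples instead of 8 × 1, so the ensemble totals 480 samples from 6 trajectories rather than 128 from 16 — consistent with the array shapes `(6, 8, ll, ...)` used in `true_ls.py`, `glswlsols.py`, and `diffusion.py` above.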