From 156adf2767f90cc4612f7f2d679d9ecfa18e9d46 Mon Sep 17 00:00:00 2001
From: nathan
Date: Mon, 3 Jun 2024 20:20:36 +0200
Subject: [PATCH] Move `DistributionDataFrameHistogram` for typing purposes

---
Notes (review; this free-text area is not part of the commit message):

    Scope: pure move. The 61 added lines are identical to the 61 removed
    lines; the class is relocated so that it is defined above
    `DistributionDataFrame`, whose `to_hist` is annotated as returning it.

    Formatting: the line structure and indentation of this patch were
    reconstructed from a whitespace-collapsed copy. The indentation of the
    first context line of hunk 1 (the lone `)`) is an assumption (8 spaces)
    and must be verified against the base file before applying.

    Observations on the moved code, intentionally NOT changed here so the
    move stays reviewable; candidates for a follow-up commit:

    * `_sparsify` passes `inplace=True` to `Series.map`, which has no
      `inplace` parameter.
    * `_sparsify` uses `math.floor`, i.e. it rounds down to a multiple of
      `prec`, while its docstring says "nearest".
    * `_bin` calls `binning(...)`; presumably indexing (`binning[...]`) was
      intended (to be confirmed by the author).
    * `_sparsify` and `_bin` return the result of `df['value'].map(...)`
      although they are annotated `-> pd.DataFrame`; `_cmp_hist` is annotated
      as taking a `pd.Series` but indexes `df['value']`.
    * `to_nomad` looks up `constructor_map[self._ll]` with keys 2, 3 and 4
      only; any other `ll` raises `KeyError`.

 src/nomad_simulations/model_system.py | 122 +++++++++++++-------------
 1 file changed, 61 insertions(+), 61 deletions(-)

diff --git a/src/nomad_simulations/model_system.py b/src/nomad_simulations/model_system.py
index 62c72bef..3514926a 100644
--- a/src/nomad_simulations/model_system.py
+++ b/src/nomad_simulations/model_system.py
@@ -503,6 +503,67 @@ def normalize(self, archive, logger: BoundLogger):
         )
 
 
+class DistributionDataFrameHistogram:
+    def __init__(
+        self,
+        el_distrs: pd.DataFrame,
+        cutoffs: pd.DataFrame,
+        ll: int,
+        bins: pint.Quantity,
+    ) -> None:
+        self._ll = ll
+        self._hists: dict[Mode, pd.DataFrame] = {}
+        self._cutoffs: dict[Mode, pint.Quantity] = {}
+        protocol = (
+            lambda x: self._sparsify(x, bins)
+            if isinstance(bins.magnitude, (int, float))
+            else self._bin(x, bins)
+        )
+
+        distr = self._sel_distr_cols(el_distrs)
+        for mode, df in distr.groupby(distr.columns[:-1]):
+            self._hists[mode] = self._cmp_hist(protocol(df))
+            self._cutoffs[mode] = cutoffs.loc[mode]
+
+    def _sel_distr_cols(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Select the appropriate elemental combinations."""
+        return df[df.count(axis=1) == self._ll].dropna(axis=0)
+
+    def _cmp_hist(self, df: pd.Series) -> pd.DataFrame:
+        """Return a histogram of the distribution."""
+        hist = df[df['value'] > 0]
+        hist = hist.groupby('value').size().reset_index(name='count')
+        hist['freq'] = hist['count'].apply(lambda x: x / hist['count'].min())
+        return hist
+
+    def _sparsify(self, df: pd.DataFrame, prec: pint.Quantity) -> pd.DataFrame:
+        """Sparsify the distribution by rounding the values to the nearest `prec`."""
+        return df['value'].map(lambda x: math.floor(x / prec) * prec, inplace=True)
+
+    def _bin(self, df: pd.DataFrame, binning: pint.Quantity) -> pd.DataFrame:
+        """Bin the distribution by the `binning` values."""
+        return df['value'].map(lambda x: binning(np.min(np.where(x > binning))))
+
+    def get(self, mode: Mode) -> tuple[pint.Quantity, pd.DataFrame]:
+        return self._cutoffs[mode], self._hists[mode]
+
+    def to_nomad(self) -> list[GeometryDistribution]:
+        constructor_map = {
+            2: DistanceGeometryDistribution,
+            3: AngleGeometryDistribution,
+            4: DihedralGeometryDistribution,
+        }
+        return [
+            constructor_map[self._ll](
+                element_cutoff_selection=mode,
+                distance_cutoffs=self._cutoffs[mode],
+                bins=self._hists[mode]['value'],
+                frequencies=self._hists[mode]['freq'],
+            )
+            for mode in self._hists.keys()
+        ]
+
+
 class DistributionDataFrame:
     def __init__(
         self,
@@ -619,67 +680,6 @@ def to_hist(self, ll: int, bins: pint.Quantity) -> DistributionDataFrameHistogra
         )
 
 
-class DistributionDataFrameHistogram:
-    def __init__(
-        self,
-        el_distrs: pd.DataFrame,
-        cutoffs: pd.DataFrame,
-        ll: int,
-        bins: pint.Quantity,
-    ) -> None:
-        self._ll = ll
-        self._hists: dict[Mode, pd.DataFrame] = {}
-        self._cutoffs: dict[Mode, pint.Quantity] = {}
-        protocol = (
-            lambda x: self._sparsify(x, bins)
-            if isinstance(bins.magnitude, (int, float))
-            else self._bin(x, bins)
-        )
-
-        distr = self._sel_distr_cols(el_distrs)
-        for mode, df in distr.groupby(distr.columns[:-1]):
-            self._hists[mode] = self._cmp_hist(protocol(df))
-            self._cutoffs[mode] = cutoffs.loc[mode]
-
-    def _sel_distr_cols(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Select the appropriate elemental combinations."""
-        return df[df.count(axis=1) == self._ll].dropna(axis=0)
-
-    def _cmp_hist(self, df: pd.Series) -> pd.DataFrame:
-        """Return a histogram of the distribution."""
-        hist = df[df['value'] > 0]
-        hist = hist.groupby('value').size().reset_index(name='count')
-        hist['freq'] = hist['count'].apply(lambda x: x / hist['count'].min())
-        return hist
-
-    def _sparsify(self, df: pd.DataFrame, prec: pint.Quantity) -> pd.DataFrame:
-        """Sparsify the distribution by rounding the values to the nearest `prec`."""
-        return df['value'].map(lambda x: math.floor(x / prec) * prec, inplace=True)
-
-    def _bin(self, df: pd.DataFrame, binning: pint.Quantity) -> pd.DataFrame:
-        """Bin the distribution by the `binning` values."""
-        return df['value'].map(lambda x: binning(np.min(np.where(x > binning))))
-
-    def get(self, mode: Mode) -> tuple[pint.Quantity, pd.DataFrame]:
-        return self._cutoffs[mode], self._hists[mode]
-
-    def to_nomad(self) -> list[GeometryDistribution]:
-        constructor_map = {
-            2: DistanceGeometryDistribution,
-            3: AngleGeometryDistribution,
-            4: DihedralGeometryDistribution,
-        }
-        return [
-            constructor_map[self._ll](
-                element_cutoff_selection=mode,
-                distance_cutoffs=self._cutoffs[mode],
-                bins=self._hists[mode]['value'],
-                frequencies=self._hists[mode]['freq'],
-            )
-            for mode in self._hists.keys()
-        ]
-
-
 class AtomicCell(Cell):
     """
     A base section used to specify the atomic cell information of a system.