fixed issue leading to truncated variable level labels
MamadouSDiallo committed Jan 1, 2025
1 parent 3387bc6 commit 5b97860
Showing 55 changed files with 1,083 additions and 477 deletions.
21 changes: 8 additions & 13 deletions pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "samplics"
version = "0.4.22"
version = "0.4.23"
# license = "MIT"
description = "Select, weight and analyze complex sample data"

@@ -78,7 +78,10 @@ addopts = "--ignore=tests/apis --ignore=tests/sae --ignore=tests/types"
[tool.ruff]
src = ["src", "tests"]
# extend-exclude = ["tests"]
line-length = 99
target-version = "py313"

[tool.ruff.lint]
# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
select = ["D", "E", "F", "W", "I001"]
ignore = ["D", "E501"] # for development we can ignore the docs
@@ -112,30 +115,22 @@ exclude = [
]
per-file-ignores = {}

# Same as Black.
line-length = 99

# Allow unused variables when underscore-prefixed.
dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"

# Assume Python 3.13.
target-version = "py313"

[tool.ruff.pycodestyle]
[tool.ruff.lint.pycodestyle]
ignore-overlong-task-comments = true

[tool.ruff.mccabe]
[tool.ruff.lint.mccabe]
# Unlike Flake8, default to a complexity level of 10.
max-complexity = 10


[tool.ruff.isort]
[tool.ruff.lint.isort]
known-first-party = ["samplics"]
known-third-party = ["matplotlib", "numpy", "pandas", "polars", "pydantic", "scipy", "statsmodels"]
lines-after-imports = 2
lines-between-types = 1


[tool.mypy]
ignore_missing_imports = true
follow_imports = "silent"
@@ -162,4 +157,4 @@ exclude = [".venv", "tests"]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
build-backend = "hatchling.build"
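Alongside the version bump to 0.4.23, the hunks above migrate the Ruff configuration to the newer layout: the select/ignore rules and the pycodestyle, mccabe, and isort tables now live under [tool.ruff.lint], while line-length and target-version stay directly under [tool.ruff], relocated above the new lint table (TOML keys placed after a [tool.ruff.lint] header would belong to that table). A minimal sketch of how to confirm the resulting layout, assuming this pyproject.toml sits in the current directory; tomllib ships with Python 3.11+ and the project targets py313:

```python
# Minimal sketch, assuming the pyproject.toml above is in the working
# directory. Prints where each setting ended up after the migration.
import tomllib

with open("pyproject.toml", "rb") as f:
    ruff = tomllib.load(f)["tool"]["ruff"]

print(ruff["line-length"])       # 99 -- still top-level
print(ruff["target-version"])    # "py313" -- still top-level
print(ruff["lint"]["select"])    # ["D", "E", "F", "W", "I001"] -- moved under lint
print(ruff["lint"]["isort"]["known-first-party"])  # ["samplics"]
```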
2 changes: 1 addition & 1 deletion src/samplics/__init__.py
@@ -143,4 +143,4 @@
"Sample",
]

__version__ = "0.4.22"
__version__ = "0.4.23"
4 changes: 3 additions & 1 deletion src/samplics/apis/predict.py
@@ -9,4 +9,6 @@ def predict(
intercept: bool = True, # if True, it adds an intercept of 1
b_const: DictStrNum | Number = 1.0,
):
return _predict_eblup(x=x, fit_eblup=fit_stats, y=y, intercept=intercept, b_const=b_const)
return _predict_eblup(
x=x, fit_eblup=fit_stats, y=y, intercept=intercept, b_const=b_const
)
102 changes: 66 additions & 36 deletions src/samplics/categorical/comparison.py
@@ -19,13 +19,24 @@
from samplics.utils.basic_functions import set_variables_names
from samplics.utils.checks import assert_probabilities
from samplics.utils.formats import numpy_array
from samplics.utils.types import Array, Number, Series, SinglePSUEst, StringNumber, PopParam
from samplics.utils.types import (
Array,
Number,
PopParam,
Series,
SinglePSUEst,
StringNumber,
)


class Ttest:
def __init__(self, samp_type: str, paired: bool = False, alpha: float = 0.05) -> None:
def __init__(
self, samp_type: str, paired: bool = False, alpha: float = 0.05
) -> None:
if samp_type.lower() not in ("one-sample", "two-sample"):
raise ValueError("Parameter 'type' must be equal to 'one-sample', 'two-sample'!")
raise ValueError(
"Parameter 'type' must be equal to 'one-sample', 'two-sample'!"
)
assert_probabilities(x=alpha)

self.samp_type = samp_type.lower()
@@ -51,17 +62,27 @@ def __str__(self) -> str:
return "No table to display"
else:
tbl_head = f"Design-based {self.samp_type.title()} T-test"
if (self.samp_type == "one-sample" and self.group_names == []) or self.paired:
if (
self.samp_type == "one-sample" and self.group_names == []
) or self.paired:
if self.samp_type == "one-sample":
tbl_subhead1 = f" Null hypothesis (Ho): mean = {self.stats['known_mean']}"
tbl_subhead1 = (
f" Null hypothesis (Ho): mean = {self.stats['known_mean']}"
)
else:
tbl_subhead1 = f" Null hypothesis (Ho): mean(Diff = {self.vars_names[0]} - {self.vars_names[1]}) = 0"
tbl_subhead2 = f" t statistics: {self.stats['t']:.4f}"
tbl_subhead3 = f" Degrees of freedom: {self.stats['df']:.2f}"
tbl_subhead4 = " Alternative hypothesis (Ha):"
tbl_subhead4a = f" Prob(T < t) = {self.stats['p_value']['less_than']:.4f}"
tbl_subhead4b = f" Prob(|T| > |t|) = {self.stats['p_value']['not_equal']:.4f}"
tbl_subhead4c = f" Prob(T > t) = {self.stats['p_value']['greater_than']:.4f}"
tbl_subhead4a = (
f" Prob(T < t) = {self.stats['p_value']['less_than']:.4f}"
)
tbl_subhead4b = (
f" Prob(|T| > |t|) = {self.stats['p_value']['not_equal']:.4f}"
)
tbl_subhead4c = (
f" Prob(T > t) = {self.stats['p_value']['greater_than']:.4f}"
)

return f"\n{tbl_head}\n{tbl_subhead1}\n{tbl_subhead2}\n{tbl_subhead3}\n{tbl_subhead4}\n{tbl_subhead4a}\n{tbl_subhead4b}\n{tbl_subhead4c} \n\n{self.to_dataframe().to_string(index=False)}\n"

@@ -71,30 +92,22 @@ def __str__(self) -> str:
tbl_subhead1 = f" Null hypothesis (Ho): mean({self.group_names[0]}) = mean({self.group_names[1]}) "
tbl_subhead2 = " Equal variance assumption:"
tbl_subhead2a = f" t statistics: {self.stats['t_eq_variance']:.4f}"
tbl_subhead2b = f" Degrees of freedom: {self.stats['df_eq_variance']:.2f}"
tbl_subhead3 = " Alternative hypothesis (Ha):"
tbl_subhead3a = (
f" Prob(T < t) = {self.stats['p_value_eq_variance']['less_than']:.4f}"
)
tbl_subhead3b = (
f" Prob(|T| > |t|) = {self.stats['p_value_eq_variance']['not_equal']:.4f}"
)
tbl_subhead3c = (
f" Prob(T > t) = {self.stats['p_value_eq_variance']['greater_than']:.4f}"
tbl_subhead2b = (
f" Degrees of freedom: {self.stats['df_eq_variance']:.2f}"
)
tbl_subhead3 = " Alternative hypothesis (Ha):"
tbl_subhead3a = f" Prob(T < t) = {self.stats['p_value_eq_variance']['less_than']:.4f}"
tbl_subhead3b = f" Prob(|T| > |t|) = {self.stats['p_value_eq_variance']['not_equal']:.4f}"
tbl_subhead3c = f" Prob(T > t) = {self.stats['p_value_eq_variance']['greater_than']:.4f}"
tbl_subhead4 = " Unequal variance assumption:"
tbl_subhead4a = f" t statistics: {self.stats['t_uneq_variance']:.4f}"
tbl_subhead4b = f" Degrees of freedom: {self.stats['df_uneq_variance']:.2f}"
tbl_subhead5 = " Alternative hypothesis (Ha):"
tbl_subhead5a = (
f" Prob(T < t) = {self.stats['p_value_uneq_variance']['less_than']:.4f}"
)
tbl_subhead5b = (
f" Prob(|T| > |t|) = {self.stats['p_value_uneq_variance']['not_equal']:.4f}"
)
tbl_subhead5c = (
f" Prob(T > t) = {self.stats['p_value_uneq_variance']['greater_than']:.4f}"
tbl_subhead4b = (
f" Degrees of freedom: {self.stats['df_uneq_variance']:.2f}"
)
tbl_subhead5 = " Alternative hypothesis (Ha):"
tbl_subhead5a = f" Prob(T < t) = {self.stats['p_value_uneq_variance']['less_than']:.4f}"
tbl_subhead5b = f" Prob(|T| > |t|) = {self.stats['p_value_uneq_variance']['not_equal']:.4f}"
tbl_subhead5c = f" Prob(T > t) = {self.stats['p_value_uneq_variance']['greater_than']:.4f}"

return f"\n{tbl_head}\n{tbl_subhead1}\n{tbl_subhead2}\n{tbl_subhead2a}\n{tbl_subhead2b}\n{tbl_subhead3}\n{tbl_subhead3a}\n{tbl_subhead3b}\n{tbl_subhead3c}\n{tbl_subhead4}\n{tbl_subhead4a}\n{tbl_subhead4b}\n{tbl_subhead5}\n{tbl_subhead5a}\n{tbl_subhead5b}\n{tbl_subhead5c} \n\n{self.to_dataframe().to_string(index=False)}\n"
else:
Expand All @@ -110,7 +123,9 @@ def _one_sample_one_group(
ssu: Array,
fpc: Union[Dict, float] = 1,
coef_var: bool = False,
single_psu: Union[SinglePSUEst, dict[StringNumber, SinglePSUEst]] = SinglePSUEst.error,
single_psu: Union[
SinglePSUEst, dict[StringNumber, SinglePSUEst]
] = SinglePSUEst.error,
strata_comb: Optional[dict[Array, Array]] = None,
) -> None:
one_sample = TaylorEstimator(param=PopParam.mean, alpha=self.alpha)
@@ -179,7 +194,10 @@ def _two_groups_unpaired(

t_equal_variance = (mean_group1 - mean_group2) / (
math.sqrt(
((nb_obs_group1 - 1) * stddev_group1**2 + (nb_obs_group2 - 1) * stddev_group2**2)
(
(nb_obs_group1 - 1) * stddev_group1**2
+ (nb_obs_group2 - 1) * stddev_group2**2
)
/ (nb_obs_group1 + nb_obs_group2 - 2)
)
* math.sqrt(1 / nb_obs_group1 + 1 / nb_obs_group2)
@@ -199,10 +217,14 @@
)

left_p_value_equal_variance = t.cdf(t_equal_variance, t_df_equal_variance)
both_p_value_equal_variance = 2 * t.cdf(-abs(t_equal_variance), t_df_equal_variance)
both_p_value_equal_variance = 2 * t.cdf(
-abs(t_equal_variance), t_df_equal_variance
)

left_p_value_unequal_variance = t.cdf(t_unequal_variance, t_df_unequal_variance)
both_p_value_unequal_variance = 2 * t.cdf(-abs(t_unequal_variance), t_df_unequal_variance)
both_p_value_unequal_variance = 2 * t.cdf(
-abs(t_unequal_variance), t_df_unequal_variance
)

stats = {
"number_obs": {group1: nb_obs_group1, group2: nb_obs_group2},
@@ -255,7 +277,9 @@ def _two_samples_unpaired(
ssu: Optional[Array] = None,
fpc: Union[Dict, float] = 1,
coef_var: bool = False,
single_psu: Union[SinglePSUEst, dict[StringNumber, SinglePSUEst]] = SinglePSUEst.error,
single_psu: Union[
SinglePSUEst, dict[StringNumber, SinglePSUEst]
] = SinglePSUEst.error,
strata_comb: Optional[dict[Array, Array]] = None,
) -> None:
two_samples_unpaired = TaylorEstimator(param=PopParam.mean, alpha=self.alpha)
@@ -285,7 +309,9 @@ def compare(
ssu: Optional[Array] = None,
fpc: Union[Dict, float] = 1,
coef_var: bool = False,
single_psu: Union[SinglePSUEst, dict[StringNumber, SinglePSUEst]] = SinglePSUEst.error,
single_psu: Union[
SinglePSUEst, dict[StringNumber, SinglePSUEst]
] = SinglePSUEst.error,
strata_comb: Optional[dict[Array, Array]] = None,
remove_nan: bool = False,
) -> None:
@@ -294,7 +320,9 @@ def compare(
if known_mean is None and group is None:
raise AssertionError("Parameters 'known_mean' or 'group' must be provided!")
if known_mean is not None and group is not None:
raise AssertionError("Only one parameter 'known_mean' or 'group' should be provided!")
raise AssertionError(
"Only one parameter 'known_mean' or 'group' should be provided!"
)

if varnames is None:
self.vars_names = set_variables_names(y, None, "var")
@@ -362,7 +390,9 @@ def compare(
strata_comb=strata_comb,
)

two_samples_unpaired = TaylorEstimator(param=PopParam.mean, alpha=self.alpha)
two_samples_unpaired = TaylorEstimator(
param=PopParam.mean, alpha=self.alpha
)
two_samples_unpaired.estimate(
y=_y,
by=_group,
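The _two_groups_unpaired hunks above only reflow the pooled (equal-variance) t statistic; the formula itself is unchanged: the difference of group means divided by the pooled standard deviation times sqrt(1/n1 + 1/n2), with the pooled variance weighted by n1 + n2 - 2, and a two-sided p-value of 2 * P(T <= -|t|), exactly the t.cdf forms in the diff. A toy check with invented summary numbers, not samplics output, using the classical n1 + n2 - 2 degrees of freedom:

```python
# Toy check of the pooled (equal-variance) t statistic reformatted above.
# All summary numbers are invented for illustration.
import math

from scipy.stats import t

mean1, mean2 = 10.0, 11.2  # hypothetical group means
sd1, sd2 = 2.0, 2.5        # hypothetical group standard deviations
n1, n2 = 30, 35            # hypothetical group sizes

pooled_var = ((n1 - 1) * sd1**2 + (n2 - 1) * sd2**2) / (n1 + n2 - 2)
t_eq = (mean1 - mean2) / (math.sqrt(pooled_var) * math.sqrt(1 / n1 + 1 / n2))
df_eq = n1 + n2 - 2

# Same one- and two-sided forms as in the diff: t.cdf for Ha: less-than,
# and 2 * t.cdf(-|t|, df) for Ha: not-equal.
p_less = t.cdf(t_eq, df_eq)
p_not_equal = 2 * t.cdf(-abs(t_eq), df_eq)
print(f"t = {t_eq:.4f}, df = {df_eq}, two-sided p = {p_not_equal:.4f}")
```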
56 changes: 17 additions & 39 deletions src/samplics/categorical/tabulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
from samplics.estimation import TaylorEstimator
from samplics.utils.basic_functions import set_variables_names
from samplics.utils.errors import DimensionError
from samplics.utils.formats import concatenate_series_to_str, numpy_array, remove_nans
from samplics.utils.types import Array, Number, SinglePSUEst, StringNumber, PopParam
from samplics.utils.formats import numpy_array, remove_nans
from samplics.utils.types import Array, Number, PopParam, SinglePSUEst, StringNumber


class Tabulation:
@@ -30,7 +30,7 @@ def __init__(
alpha: float = 0.05,
ciprop_method: str = "logit",
) -> None:
if not param in (PopParam.count, PopParam.prop):
if param not in (PopParam.count, PopParam.prop):
raise ValueError("Parameter must be 'count' or 'proportion'!")
self.param = param
self.type = "oneway"
@@ -89,9 +89,7 @@ def _estimate(
to_keep = to_keep & remove_nans(var.values.ravel().shape[0], var.values.ravel())
elif var.ndim == 2: # DataFrame
for col in var.columns:
to_keep = to_keep & remove_nans(
var.values.ravel().shape[0], var[col].values.ravel()
)
to_keep = to_keep & remove_nans(var.values.ravel().shape[0], var[col].values.ravel())
else:
raise DimensionError("The dimension must be 1 or 2.")

@@ -180,20 +178,12 @@ def tabulate(
vars_names = set_variables_names(vars, varnames, prefix)

if len(vars_names) != nb_vars:
raise AssertionError(
"Length of varnames must be the same as the number of columns of vars"
)
raise AssertionError("Length of varnames must be the same as the number of columns of vars")

_samp_weight = numpy_array(samp_weight)

_samp_weight = (
np.ones(vars_df.shape[0]) if _samp_weight.shape in ((), (0,)) else _samp_weight
)
_samp_weight = (
np.repeat(_samp_weight, vars_df.shape[0])
if _samp_weight.shape[0] == 1
else _samp_weight
)
_samp_weight = np.ones(vars_df.shape[0]) if _samp_weight.shape in ((), (0,)) else _samp_weight
_samp_weight = np.repeat(_samp_weight, vars_df.shape[0]) if _samp_weight.shape[0] == 1 else _samp_weight
_stratum = numpy_array(stratum)
_psu = numpy_array(psu)
_ssu = numpy_array(ssu)
@@ -314,7 +304,7 @@ def __init__(
alpha: float = 0.05,
ciprop_method: str = "logit",
) -> None:
if not param in (PopParam.count, PopParam.prop):
if param not in (PopParam.count, PopParam.prop):
raise ValueError("Parameter must be 'count' or 'proportion'!")
self.param = param
self.type = "twoway"
@@ -351,9 +341,7 @@ def __str__(self) -> str:

pearson_unadj = f"Unadjusted - {chisq_dist}: {self.stats['Pearson-Unadj']['chisq_value']:.4f} with p-value of {self.stats['Pearson-Unadj']['p_value']:.4f}"
pearson_adj = f"Adjusted - {f_dist}): {self.stats['Pearson-Adj']['f_value']:.4f} with p-value of {self.stats['Pearson-Adj']['p_value']:.4f}"
pearson_test = (
f"Pearson (with Rao-Scott adjustment):\n\t{pearson_unadj}\n\t{pearson_adj}"
)
pearson_test = f"Pearson (with Rao-Scott adjustment):\n\t{pearson_unadj}\n\t{pearson_adj}"

lr_unadj = f" Unadjusted - {chisq_dist}: {self.stats['LR-Unadj']['chisq_value']:.4f} with p-value of {self.stats['LR-Unadj']['p_value']:.4f}"
lr_adj = f" Adjusted - {f_dist}): {self.stats['LR-Adj']['f_value']:.4f} with p-value of {self.stats['LR-Adj']['p_value']:.4f}"
@@ -362,9 +350,7 @@ def __str__(self) -> str:
return f"\n{tbl_head}\n{tbl_subhead1}\n{tbl_subhead2}\n{tbl_subhead3}\n{tbl_subhead4}\n\n {self.to_dataframe().to_string(index=False)}\n\n{pearson_test}\n\n {lr_test}\n"

# also mutates tbl_est
def _extract_estimates(
self, tbl_est, vars_levels
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
def _extract_estimates(self, tbl_est, vars_levels) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
levels = list(tbl_est.point_est.keys())
missing_levels = vars_levels[~np.isin(vars_levels, levels)]
if missing_levels.shape[0] > 0:
@@ -449,9 +435,7 @@ def tabulate(
# vars_nans = vars.isna()
# excluded_units = vars_nans.iloc[:, 0] | vars_nans.iloc[:, 1]
to_keep = remove_nans(vars.shape[0], vars.iloc[:, 0].values, vars.iloc[:, 1].values)
samp_weight = (
samp_weight[to_keep] if samp_weight.shape not in ((), (0,)) else samp_weight
)
samp_weight = samp_weight[to_keep] if samp_weight.shape not in ((), (0,)) else samp_weight
stratum = stratum[to_keep] if stratum.shape not in ((), (0,)) else stratum
psu = psu[to_keep] if psu.shape not in ((), (0,)) else psu
ssu = ssu[to_keep] if ssu.shape not in ((), (0,)) else ssu
@@ -489,15 +473,13 @@
# vars_dummies = np.delete(vars_dummies, obj=2, axis=1)

if len(vars.shape) == 2:
vars_for_oneway = np.apply_along_axis(
func1d=concatenate_series_to_str, axis=1, arr=vars
)
# vars_for_oneway = np.apply_along_axis(func1d=concatenate_series_to_str, axis=1, arr=vars)
vars_for_oneway = vars.agg("__by__".join, axis=1).values
else:
vars_for_oneway = vars

vars_levels_concat = np.apply_along_axis(
func1d=concatenate_series_to_str, axis=1, arr=vars_levels
)
# vars_levels_concat = np.apply_along_axis(func1d=concatenate_series_to_str, axis=1, arr=vars_levels)
vars_levels_concat = vars_levels.agg("__by__".join, axis=1).values

tbl_est_prop = TaylorEstimator(param=PopParam.mean, alpha=self.alpha)
tbl_est_prop.estimate(
@@ -617,13 +599,9 @@ def tabulate(
if self.param == PopParam.count:
point_est_df = point_est_df / np.sum(point_est_df)

point_est_null = point_est_df.sum(axis=1).reshape(nrows, 1) @ point_est_df.sum(
axis=0
).reshape(1, ncols)
point_est_null = point_est_df.sum(axis=1).reshape(nrows, 1) @ point_est_df.sum(axis=0).reshape(1, ncols)

chisq_p = float(
vars.shape[0] * np.sum((point_est_df - point_est_null) ** 2 / point_est_null)
)
chisq_p = float(vars.shape[0] * np.sum((point_est_df - point_est_null) ** 2 / point_est_null))

# valid indexes (i,j) correspond to n_ij > 0
valid_indx = (point_est_df != 0) & (point_est_null != 0)
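The rejoined lines above compute the unadjusted Pearson statistic for the two-way table: expected cell proportions under independence are the outer product of the row and column margins, and the statistic is the sample size times the sum of squared deviations divided by the expected proportions. A small worked example with an invented 2x2 table, not samplics output:

```python
# Worked example of the unadjusted Pearson chi-square computed above,
# with an invented 2x2 table of observed cell proportions.
import numpy as np

n = 200  # hypothetical sample size
point_est = np.array([[0.30, 0.20],
                      [0.15, 0.35]])  # observed cell proportions, sum to 1

# Expected proportions under independence: outer product of the margins,
# mirroring point_est_df.sum(axis=1).reshape(...) @ point_est_df.sum(axis=0).reshape(...).
point_null = point_est.sum(axis=1).reshape(2, 1) @ point_est.sum(axis=0).reshape(1, 2)

chisq_p = float(n * np.sum((point_est - point_null) ** 2 / point_null))
print(chisq_p)  # ~18.2 for these numbers
```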
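The substantive change in this file, and the source of the commit title, sits in the hunk at old lines 489-503 above: np.apply_along_axis(func1d=concatenate_series_to_str, ...) is replaced by vars.agg("__by__".join, axis=1).values (and likewise for vars_levels). np.apply_along_axis allocates its output array with the dtype of the first row's result, so any concatenated level label longer than the first one was silently truncated; staying in pandas keeps Python strings end to end. A toy reproduction with illustrative data only, not samplics internals:

```python
# Toy reproduction of the label truncation this commit fixes.
import numpy as np
import pandas as pd

levels = pd.DataFrame(
    {"var1": ["x", "longer-level"], "var2": ["y", "another-long-level"]}
)

# np.apply_along_axis sizes its output dtype from the first row's result
# ("x__by__y", 8 characters), so longer labels get silently cut off.
old_way = np.apply_along_axis("__by__".join, 1, levels.to_numpy(dtype=str))
print(old_way)  # ['x__by__y' 'longer-l']

# The replacement used in this commit keeps Python object strings, so no
# fixed width is ever imposed on the concatenated labels.
new_way = levels.agg("__by__".join, axis=1).values
print(new_way)  # ['x__by__y' 'longer-level__by__another-long-level']
```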