fixed issue leading to truncated variable level labels
MamadouSDiallo committed Jan 1, 2025
1 parent 3387bc6 commit 5b97860
Showing 55 changed files with 1,083 additions and 477 deletions.
21 changes: 8 additions & 13 deletions pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "samplics"
version = "0.4.22"
version = "0.4.23"
# license = "MIT"
description = "Select, weight and analyze complex sample data"

@@ -78,7 +78,10 @@ addopts = "--ignore=tests/apis --ignore=tests/sae --ignore=tests/types"
[tool.ruff]
src = ["src", "tests"]
# extend-exclude = ["tests"]
line-length = 99
target-version = "py313"

[tool.ruff.lint]
# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
select = ["D", "E", "F", "W", "I001"]
ignore = ["D", "E501"] # for development we can ignore the docs
@@ -112,30 +115,22 @@ exclude = [
]
per-file-ignores = {}

# Same as Black.
line-length = 99

# Allow unused variables when underscore-prefixed.
dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"

# Assume Python 3.13.
target-version = "py313"

[tool.ruff.pycodestyle]
[tool.ruff.lint.pycodestyle]
ignore-overlong-task-comments = true

[tool.ruff.mccabe]
[tool.ruff.lint.mccabe]
# Unlike Flake8, default to a complexity level of 10.
max-complexity = 10


[tool.ruff.isort]
[tool.ruff.lint.isort]
known-first-party = ["samplics"]
known-third-party = ["matplotlib", "numpy", "pandas", "polars", "pydantic", "scipy", "statsmodels"]
lines-after-imports = 2
lines-between-types = 1


[tool.mypy]
ignore_missing_imports = true
follow_imports = "silent"
@@ -162,4 +157,4 @@ exclude = [".venv", "tests"]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
build-backend = "hatchling.build"
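Alongside the version bump to 0.4.23, the hunks above migrate the Ruff configuration to the newer layout: the select/ignore rules and the pycodestyle, mccabe, and isort tables now live under [tool.ruff.lint], while line-length and target-version stay directly under [tool.ruff], relocated above the new lint table (TOML keys placed after a [tool.ruff.lint] header would belong to that table). A minimal sketch of how to confirm the resulting layout, assuming this pyproject.toml sits in the current directory; tomllib ships with Python 3.11+ and the project targets py313:

```python
# Minimal sketch, assuming the pyproject.toml above is in the working
# directory. Prints where each setting ended up after the migration.
import tomllib

with open("pyproject.toml", "rb") as f:
    ruff = tomllib.load(f)["tool"]["ruff"]

print(ruff["line-length"])       # 99 -- still top-level
print(ruff["target-version"])    # "py313" -- still top-level
print(ruff["lint"]["select"])    # ["D", "E", "F", "W", "I001"] -- moved under lint
print(ruff["lint"]["isort"]["known-first-party"])  # ["samplics"]
```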
2 changes: 1 addition & 1 deletion src/samplics/__init__.py
@@ -143,4 +143,4 @@
"Sample",
]

__version__ = "0.4.22"
__version__ = "0.4.23"
4 changes: 3 additions & 1 deletion src/samplics/apis/predict.py
@@ -9,4 +9,6 @@ def predict(
intercept: bool = True, # if True, it adds an intercept of 1
b_const: DictStrNum | Number = 1.0,
):
return _predict_eblup(x=x, fit_eblup=fit_stats, y=y, intercept=intercept, b_const=b_const)
return _predict_eblup(
x=x, fit_eblup=fit_stats, y=y, intercept=intercept, b_const=b_const
)
102 changes: 66 additions & 36 deletions src/samplics/categorical/comparison.py
@@ -19,13 +19,24 @@
from samplics.utils.basic_functions import set_variables_names
from samplics.utils.checks import assert_probabilities
from samplics.utils.formats import numpy_array
from samplics.utils.types import Array, Number, Series, SinglePSUEst, StringNumber, PopParam
from samplics.utils.types import (
Array,
Number,
PopParam,
Series,
SinglePSUEst,
StringNumber,
)


class Ttest:
def __init__(self, samp_type: str, paired: bool = False, alpha: float = 0.05) -> None:
def __init__(
self, samp_type: str, paired: bool = False, alpha: float = 0.05
) -> None:
if samp_type.lower() not in ("one-sample", "two-sample"):
raise ValueError("Parameter 'type' must be equal to 'one-sample', 'two-sample'!")
raise ValueError(
"Parameter 'type' must be equal to 'one-sample', 'two-sample'!"
)
assert_probabilities(x=alpha)

self.samp_type = samp_type.lower()
@@ -51,17 +62,27 @@ def __str__(self) -> str:
return "No table to display"
else:
tbl_head = f"Design-based {self.samp_type.title()} T-test"
if (self.samp_type == "one-sample" and self.group_names == []) or self.paired:
if (
self.samp_type == "one-sample" and self.group_names == []
) or self.paired:
if self.samp_type == "one-sample":
tbl_subhead1 = f" Null hypothesis (Ho): mean = {self.stats['known_mean']}"
tbl_subhead1 = (
f" Null hypothesis (Ho): mean = {self.stats['known_mean']}"
)
else:
tbl_subhead1 = f" Null hypothesis (Ho): mean(Diff = {self.vars_names[0]} - {self.vars_names[1]}) = 0"
tbl_subhead2 = f" t statistics: {self.stats['t']:.4f}"
tbl_subhead3 = f" Degrees of freedom: {self.stats['df']:.2f}"
tbl_subhead4 = " Alternative hypothesis (Ha):"
tbl_subhead4a = f" Prob(T < t) = {self.stats['p_value']['less_than']:.4f}"
tbl_subhead4b = f" Prob(|T| > |t|) = {self.stats['p_value']['not_equal']:.4f}"
tbl_subhead4c = f" Prob(T > t) = {self.stats['p_value']['greater_than']:.4f}"
tbl_subhead4a = (
f" Prob(T < t) = {self.stats['p_value']['less_than']:.4f}"
)
tbl_subhead4b = (
f" Prob(|T| > |t|) = {self.stats['p_value']['not_equal']:.4f}"
)
tbl_subhead4c = (
f" Prob(T > t) = {self.stats['p_value']['greater_than']:.4f}"
)

return f"\n{tbl_head}\n{tbl_subhead1}\n{tbl_subhead2}\n{tbl_subhead3}\n{tbl_subhead4}\n{tbl_subhead4a}\n{tbl_subhead4b}\n{tbl_subhead4c} \n\n{self.to_dataframe().to_string(index=False)}\n"

@@ -71,30 +92,22 @@ def __str__(self) -> str:
tbl_subhead1 = f" Null hypothesis (Ho): mean({self.group_names[0]}) = mean({self.group_names[1]}) "
tbl_subhead2 = " Equal variance assumption:"
tbl_subhead2a = f" t statistics: {self.stats['t_eq_variance']:.4f}"
tbl_subhead2b = f" Degrees of freedom: {self.stats['df_eq_variance']:.2f}"
tbl_subhead3 = " Alternative hypothesis (Ha):"
tbl_subhead3a = (
f" Prob(T < t) = {self.stats['p_value_eq_variance']['less_than']:.4f}"
)
tbl_subhead3b = (
f" Prob(|T| > |t|) = {self.stats['p_value_eq_variance']['not_equal']:.4f}"
)
tbl_subhead3c = (
f" Prob(T > t) = {self.stats['p_value_eq_variance']['greater_than']:.4f}"
tbl_subhead2b = (
f" Degrees of freedom: {self.stats['df_eq_variance']:.2f}"
)
tbl_subhead3 = " Alternative hypothesis (Ha):"
tbl_subhead3a = f" Prob(T < t) = {self.stats['p_value_eq_variance']['less_than']:.4f}"
tbl_subhead3b = f" Prob(|T| > |t|) = {self.stats['p_value_eq_variance']['not_equal']:.4f}"
tbl_subhead3c = f" Prob(T > t) = {self.stats['p_value_eq_variance']['greater_than']:.4f}"
tbl_subhead4 = " Unequal variance assumption:"
tbl_subhead4a = f" t statistics: {self.stats['t_uneq_variance']:.4f}"
tbl_subhead4b = f" Degrees of freedom: {self.stats['df_uneq_variance']:.2f}"
tbl_subhead5 = " Alternative hypothesis (Ha):"
tbl_subhead5a = (
f" Prob(T < t) = {self.stats['p_value_uneq_variance']['less_than']:.4f}"
)
tbl_subhead5b = (
f" Prob(|T| > |t|) = {self.stats['p_value_uneq_variance']['not_equal']:.4f}"
)
tbl_subhead5c = (
f" Prob(T > t) = {self.stats['p_value_uneq_variance']['greater_than']:.4f}"
tbl_subhead4b = (
f" Degrees of freedom: {self.stats['df_uneq_variance']:.2f}"
)
tbl_subhead5 = " Alternative hypothesis (Ha):"
tbl_subhead5a = f" Prob(T < t) = {self.stats['p_value_uneq_variance']['less_than']:.4f}"
tbl_subhead5b = f" Prob(|T| > |t|) = {self.stats['p_value_uneq_variance']['not_equal']:.4f}"
tbl_subhead5c = f" Prob(T > t) = {self.stats['p_value_uneq_variance']['greater_than']:.4f}"

return f"\n{tbl_head}\n{tbl_subhead1}\n{tbl_subhead2}\n{tbl_subhead2a}\n{tbl_subhead2b}\n{tbl_subhead3}\n{tbl_subhead3a}\n{tbl_subhead3b}\n{tbl_subhead3c}\n{tbl_subhead4}\n{tbl_subhead4a}\n{tbl_subhead4b}\n{tbl_subhead5}\n{tbl_subhead5a}\n{tbl_subhead5b}\n{tbl_subhead5c} \n\n{self.to_dataframe().to_string(index=False)}\n"
else:
Expand All @@ -110,7 +123,9 @@ def _one_sample_one_group(
ssu: Array,
fpc: Union[Dict, float] = 1,
coef_var: bool = False,
single_psu: Union[SinglePSUEst, dict[StringNumber, SinglePSUEst]] = SinglePSUEst.error,
single_psu: Union[
SinglePSUEst, dict[StringNumber, SinglePSUEst]
] = SinglePSUEst.error,
strata_comb: Optional[dict[Array, Array]] = None,
) -> None:
one_sample = TaylorEstimator(param=PopParam.mean, alpha=self.alpha)
@@ -179,7 +194,10 @@ def _two_groups_unpaired(

t_equal_variance = (mean_group1 - mean_group2) / (
math.sqrt(
((nb_obs_group1 - 1) * stddev_group1**2 + (nb_obs_group2 - 1) * stddev_group2**2)
(
(nb_obs_group1 - 1) * stddev_group1**2
+ (nb_obs_group2 - 1) * stddev_group2**2
)
/ (nb_obs_group1 + nb_obs_group2 - 2)
)
* math.sqrt(1 / nb_obs_group1 + 1 / nb_obs_group2)
@@ -199,10 +217,14 @@
)

left_p_value_equal_variance = t.cdf(t_equal_variance, t_df_equal_variance)
both_p_value_equal_variance = 2 * t.cdf(-abs(t_equal_variance), t_df_equal_variance)
both_p_value_equal_variance = 2 * t.cdf(
-abs(t_equal_variance), t_df_equal_variance
)

left_p_value_unequal_variance = t.cdf(t_unequal_variance, t_df_unequal_variance)
both_p_value_unequal_variance = 2 * t.cdf(-abs(t_unequal_variance), t_df_unequal_variance)
both_p_value_unequal_variance = 2 * t.cdf(
-abs(t_unequal_variance), t_df_unequal_variance
)

stats = {
"number_obs": {group1: nb_obs_group1, group2: nb_obs_group2},
@@ -255,7 +277,9 @@ def _two_samples_unpaired(
ssu: Optional[Array] = None,
fpc: Union[Dict, float] = 1,
coef_var: bool = False,
single_psu: Union[SinglePSUEst, dict[StringNumber, SinglePSUEst]] = SinglePSUEst.error,
single_psu: Union[
SinglePSUEst, dict[StringNumber, SinglePSUEst]
] = SinglePSUEst.error,
strata_comb: Optional[dict[Array, Array]] = None,
) -> None:
two_samples_unpaired = TaylorEstimator(param=PopParam.mean, alpha=self.alpha)
@@ -285,7 +309,9 @@ def compare(
ssu: Optional[Array] = None,
fpc: Union[Dict, float] = 1,
coef_var: bool = False,
single_psu: Union[SinglePSUEst, dict[StringNumber, SinglePSUEst]] = SinglePSUEst.error,
single_psu: Union[
SinglePSUEst, dict[StringNumber, SinglePSUEst]
] = SinglePSUEst.error,
strata_comb: Optional[dict[Array, Array]] = None,
remove_nan: bool = False,
) -> None:
@@ -294,7 +320,9 @@ def compare(
if known_mean is None and group is None:
raise AssertionError("Parameters 'known_mean' or 'group' must be provided!")
if known_mean is not None and group is not None:
raise AssertionError("Only one parameter 'known_mean' or 'group' should be provided!")
raise AssertionError(
"Only one parameter 'known_mean' or 'group' should be provided!"
)

if varnames is None:
self.vars_names = set_variables_names(y, None, "var")
@@ -362,7 +390,9 @@ def compare(
strata_comb=strata_comb,
)

two_samples_unpaired = TaylorEstimator(param=PopParam.mean, alpha=self.alpha)
two_samples_unpaired = TaylorEstimator(
param=PopParam.mean, alpha=self.alpha
)
two_samples_unpaired.estimate(
y=_y,
by=_group,
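The _two_groups_unpaired hunks above only reflow the pooled (equal-variance) t statistic; the formula itself is unchanged: the difference of group means divided by the pooled standard deviation times sqrt(1/n1 + 1/n2), with the pooled variance weighted by n1 + n2 - 2, and a two-sided p-value of 2 * P(T <= -|t|), exactly the t.cdf forms in the diff. A toy check with invented summary numbers, not samplics output, using the classical n1 + n2 - 2 degrees of freedom:

```python
# Toy check of the pooled (equal-variance) t statistic reformatted above.
# All summary numbers are invented for illustration.
import math

from scipy.stats import t

mean1, mean2 = 10.0, 11.2  # hypothetical group means
sd1, sd2 = 2.0, 2.5        # hypothetical group standard deviations
n1, n2 = 30, 35            # hypothetical group sizes

pooled_var = ((n1 - 1) * sd1**2 + (n2 - 1) * sd2**2) / (n1 + n2 - 2)
t_eq = (mean1 - mean2) / (math.sqrt(pooled_var) * math.sqrt(1 / n1 + 1 / n2))
df_eq = n1 + n2 - 2

# Same one- and two-sided forms as in the diff: t.cdf for Ha: less-than,
# and 2 * t.cdf(-|t|, df) for Ha: not-equal.
p_less = t.cdf(t_eq, df_eq)
p_not_equal = 2 * t.cdf(-abs(t_eq), df_eq)
print(f"t = {t_eq:.4f}, df = {df_eq}, two-sided p = {p_not_equal:.4f}")
```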
56 changes: 17 additions & 39 deletions src/samplics/categorical/tabulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
from samplics.estimation import TaylorEstimator
from samplics.utils.basic_functions import set_variables_names
from samplics.utils.errors import DimensionError
from samplics.utils.formats import concatenate_series_to_str, numpy_array, remove_nans
from samplics.utils.types import Array, Number, SinglePSUEst, StringNumber, PopParam
from samplics.utils.formats import numpy_array, remove_nans
from samplics.utils.types import Array, Number, PopParam, SinglePSUEst, StringNumber


class Tabulation:
@@ -30,7 +30,7 @@ def __init__(
alpha: float = 0.05,
ciprop_method: str = "logit",
) -> None:
if not param in (PopParam.count, PopParam.prop):
if param not in (PopParam.count, PopParam.prop):
raise ValueError("Parameter must be 'count' or 'proportion'!")
self.param = param
self.type = "oneway"
@@ -89,9 +89,7 @@ def _estimate(
to_keep = to_keep & remove_nans(var.values.ravel().shape[0], var.values.ravel())
elif var.ndim == 2: # DataFrame
for col in var.columns:
to_keep = to_keep & remove_nans(
var.values.ravel().shape[0], var[col].values.ravel()
)
to_keep = to_keep & remove_nans(var.values.ravel().shape[0], var[col].values.ravel())
else:
raise DimensionError("The dimension must be 1 or 2.")

@@ -180,20 +178,12 @@ def tabulate(
vars_names = set_variables_names(vars, varnames, prefix)

if len(vars_names) != nb_vars:
raise AssertionError(
"Length of varnames must be the same as the number of columns of vars"
)
raise AssertionError("Length of varnames must be the same as the number of columns of vars")

_samp_weight = numpy_array(samp_weight)

_samp_weight = (
np.ones(vars_df.shape[0]) if _samp_weight.shape in ((), (0,)) else _samp_weight
)
_samp_weight = (
np.repeat(_samp_weight, vars_df.shape[0])
if _samp_weight.shape[0] == 1
else _samp_weight
)
_samp_weight = np.ones(vars_df.shape[0]) if _samp_weight.shape in ((), (0,)) else _samp_weight
_samp_weight = np.repeat(_samp_weight, vars_df.shape[0]) if _samp_weight.shape[0] == 1 else _samp_weight
_stratum = numpy_array(stratum)
_psu = numpy_array(psu)
_ssu = numpy_array(ssu)
@@ -314,7 +304,7 @@ def __init__(
alpha: float = 0.05,
ciprop_method: str = "logit",
) -> None:
if not param in (PopParam.count, PopParam.prop):
if param not in (PopParam.count, PopParam.prop):
raise ValueError("Parameter must be 'count' or 'proportion'!")
self.param = param
self.type = "twoway"
@@ -351,9 +341,7 @@ def __str__(self) -> str:

pearson_unadj = f"Unadjusted - {chisq_dist}: {self.stats['Pearson-Unadj']['chisq_value']:.4f} with p-value of {self.stats['Pearson-Unadj']['p_value']:.4f}"
pearson_adj = f"Adjusted - {f_dist}): {self.stats['Pearson-Adj']['f_value']:.4f} with p-value of {self.stats['Pearson-Adj']['p_value']:.4f}"
pearson_test = (
f"Pearson (with Rao-Scott adjustment):\n\t{pearson_unadj}\n\t{pearson_adj}"
)
pearson_test = f"Pearson (with Rao-Scott adjustment):\n\t{pearson_unadj}\n\t{pearson_adj}"

lr_unadj = f" Unadjusted - {chisq_dist}: {self.stats['LR-Unadj']['chisq_value']:.4f} with p-value of {self.stats['LR-Unadj']['p_value']:.4f}"
lr_adj = f" Adjusted - {f_dist}): {self.stats['LR-Adj']['f_value']:.4f} with p-value of {self.stats['LR-Adj']['p_value']:.4f}"
@@ -362,9 +350,7 @@ def __str__(self) -> str:
return f"\n{tbl_head}\n{tbl_subhead1}\n{tbl_subhead2}\n{tbl_subhead3}\n{tbl_subhead4}\n\n {self.to_dataframe().to_string(index=False)}\n\n{pearson_test}\n\n {lr_test}\n"

# also mutates tbl_est
def _extract_estimates(
self, tbl_est, vars_levels
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
def _extract_estimates(self, tbl_est, vars_levels) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
levels = list(tbl_est.point_est.keys())
missing_levels = vars_levels[~np.isin(vars_levels, levels)]
if missing_levels.shape[0] > 0:
@@ -449,9 +435,7 @@ def tabulate(
# vars_nans = vars.isna()
# excluded_units = vars_nans.iloc[:, 0] | vars_nans.iloc[:, 1]
to_keep = remove_nans(vars.shape[0], vars.iloc[:, 0].values, vars.iloc[:, 1].values)
samp_weight = (
samp_weight[to_keep] if samp_weight.shape not in ((), (0,)) else samp_weight
)
samp_weight = samp_weight[to_keep] if samp_weight.shape not in ((), (0,)) else samp_weight
stratum = stratum[to_keep] if stratum.shape not in ((), (0,)) else stratum
psu = psu[to_keep] if psu.shape not in ((), (0,)) else psu
ssu = ssu[to_keep] if ssu.shape not in ((), (0,)) else ssu
@@ -489,15 +473,13 @@
# vars_dummies = np.delete(vars_dummies, obj=2, axis=1)

if len(vars.shape) == 2:
vars_for_oneway = np.apply_along_axis(
func1d=concatenate_series_to_str, axis=1, arr=vars
)
# vars_for_oneway = np.apply_along_axis(func1d=concatenate_series_to_str, axis=1, arr=vars)
vars_for_oneway = vars.agg("__by__".join, axis=1).values
else:
vars_for_oneway = vars

vars_levels_concat = np.apply_along_axis(
func1d=concatenate_series_to_str, axis=1, arr=vars_levels
)
# vars_levels_concat = np.apply_along_axis(func1d=concatenate_series_to_str, axis=1, arr=vars_levels)
vars_levels_concat = vars_levels.agg("__by__".join, axis=1).values

tbl_est_prop = TaylorEstimator(param=PopParam.mean, alpha=self.alpha)
tbl_est_prop.estimate(
@@ -617,13 +599,9 @@ def tabulate(
if self.param == PopParam.count:
point_est_df = point_est_df / np.sum(point_est_df)

point_est_null = point_est_df.sum(axis=1).reshape(nrows, 1) @ point_est_df.sum(
axis=0
).reshape(1, ncols)
point_est_null = point_est_df.sum(axis=1).reshape(nrows, 1) @ point_est_df.sum(axis=0).reshape(1, ncols)

chisq_p = float(
vars.shape[0] * np.sum((point_est_df - point_est_null) ** 2 / point_est_null)
)
chisq_p = float(vars.shape[0] * np.sum((point_est_df - point_est_null) ** 2 / point_est_null))

# valid indexes (i,j) correspond to n_ij > 0
valid_indx = (point_est_df != 0) & (point_est_null != 0)
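The rejoined lines above compute the unadjusted Pearson statistic for the two-way table: expected cell proportions under independence are the outer product of the row and column margins, and the statistic is the sample size times the sum of squared deviations divided by the expected proportions. A small worked example with an invented 2x2 table, not samplics output:

```python
# Worked example of the unadjusted Pearson chi-square computed above,
# with an invented 2x2 table of observed cell proportions.
import numpy as np

n = 200  # hypothetical sample size
point_est = np.array([[0.30, 0.20],
                      [0.15, 0.35]])  # observed cell proportions, sum to 1

# Expected proportions under independence: outer product of the margins,
# mirroring point_est_df.sum(axis=1).reshape(...) @ point_est_df.sum(axis=0).reshape(...).
point_null = point_est.sum(axis=1).reshape(2, 1) @ point_est.sum(axis=0).reshape(1, 2)

chisq_p = float(n * np.sum((point_est - point_null) ** 2 / point_null))
print(chisq_p)  # ~18.2 for these numbers
```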
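The substantive change in this file, and the source of the commit title, sits in the hunk at old lines 489-503 above: np.apply_along_axis(func1d=concatenate_series_to_str, ...) is replaced by vars.agg("__by__".join, axis=1).values (and likewise for vars_levels). np.apply_along_axis allocates its output array with the dtype of the first row's result, so any concatenated level label longer than the first one was silently truncated; staying in pandas keeps Python strings end to end. A toy reproduction with illustrative data only, not samplics internals:

```python
# Toy reproduction of the label truncation this commit fixes.
import numpy as np
import pandas as pd

levels = pd.DataFrame(
    {"var1": ["x", "longer-level"], "var2": ["y", "another-long-level"]}
)

# np.apply_along_axis sizes its output dtype from the first row's result
# ("x__by__y", 8 characters), so longer labels get silently cut off.
old_way = np.apply_along_axis("__by__".join, 1, levels.to_numpy(dtype=str))
print(old_way)  # ['x__by__y' 'longer-l']

# The replacement used in this commit keeps Python object strings, so no
# fixed width is ever imposed on the concatenated labels.
new_way = levels.agg("__by__".join, axis=1).values
print(new_way)  # ['x__by__y' 'longer-level__by__another-long-level']
```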