Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

654 and 579 sf expansion for construction and nulls filled for CF #184

Merged
merged 3 commits into from
Jan 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.6.1"
__version__ = "0.6.2"
4 changes: 2 additions & 2 deletions src/developer_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ global:
load_updated_snapshot: False # Whether to load the updated snapshots for amendments and additions
load_ni_data: False
load_historic_data: False
run_construction: False
run_construction: True
run_ni_construction: False
load_manual_outliers: False
load_manual_imputation: False
Expand Down Expand Up @@ -87,7 +87,7 @@ network_paths:
backdata_path: "R:/BERD Results System Development 2023/2021_data/validation-extract-responses-202112.csv"
outliers_path: "R:/BERD Results System Development 2023/DAP_emulation/outliers"
manual_outliers_path: "R:/BERD Results System Development 2023/DAP_emulation/outliers/manual_outliers/manual_outlier_2023-08-29_v67.csv"
construction_file_path: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/JC_test_construction_file.csv"
construction_file_path: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/test_construction_file.csv"
construction_file_path_ni: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/test_construction_ni_file.csv"
# construction_add_path: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/construction_additions_2023-11-06_v5.csv" # TODO Need to test
# construction_amend_path: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/construction_amendments_2023-10-31_v2.csv" # TODO Need to test
Expand Down
15 changes: 10 additions & 5 deletions src/imputation/MoR.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
calculate_totals,
)


good_statuses = ["Clear", "Clear - overridden"]
bad_statuses = ["Form sent out", "Check needed"]

Expand Down Expand Up @@ -112,6 +113,8 @@ def carry_forwards(df, backdata, impute_vars):
df = pd.merge(
df, backdata, how="left", on="reference", suffixes=("", "_prev"), indicator=True
)
# ensure the instance columns are still type "int" after merge
df = df.astype({"instance": "Int64", "instance_prev": "Int64"})

# keep only the rows needed, see function docstring for details.
no_match_cond = df["_merge"] == "left_only"
Expand All @@ -124,15 +127,17 @@ def carry_forwards(df, backdata, impute_vars):
# Copy values from relevant columns where references match
match_cond = df["_merge"] == "both"

# replace the values of certain columns with the values from the back data
# TODO: Check with methodology or BAU as to which other cols to take from backdata
# TODO: By default, columns not updated such as 4xx, 5xx will contain the current
# data, instance 0.
# Replace the values of certain columns with the values from the back data
replace_vars = ["instance", "200", "201", "601", "602", "604"]
for var in replace_vars:
df.loc[match_cond, var] = df.loc[match_cond, f"{var}_prev"]

# Update the variables to be imputed by the corresponding previous values, filling
# nulls with zeros.
for var in impute_vars:
df.loc[match_cond, f"{var}_imputed"] = df.loc[match_cond, f"{var}_prev"]
df.loc[match_cond, f"{var}_imputed"] = df.loc[
match_cond, f"{var}_prev"
].fillna(0)
df.loc[match_cond, "imp_marker"] = "CF"

df.loc[match_cond] = create_imp_class_col(df, "200_prev", "201_prev")
Expand Down
7 changes: 4 additions & 3 deletions src/imputation/imputation_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,13 +125,14 @@ def run_imputation(
# Changing all records that meet the criteria to "604" == "Yes"
imputed_df.loc[(chk_mask & imputation_mask), "604"] = "Yes"

# Run short form expansion
imputed_df = run_sf_expansion(imputed_df, config)

# join constructed rows back to the imputed df
# Note that constructed rows need to be included in short form expansion
if "is_constructed" in df.columns:
imputed_df = pd.concat([imputed_df, constructed_df])

# Run short form expansion
imputed_df = run_sf_expansion(imputed_df, config)

# join manually trimmed columns back to the imputed df
if not trimmed_df.empty:
imputed_df = pd.concat([imputed_df, trimmed_df])
Expand Down
35 changes: 30 additions & 5 deletions src/imputation/sf_expansion.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,19 +165,44 @@ def apply_expansion(
return expanded_df


def prepare_short_form_constructed(
    df: pd.DataFrame, master_cols: List[str]
) -> pd.DataFrame:
    """Prepare the constructed short form responses for sf expansion.

    The constructed records were removed from imputation, so it is necessary to
    copy the master columns to the empty "imputed" master columns — for example,
    column "211" is copied to "211_imputed". It is also necessary to create
    imputation classes for these records.

    Args:
        df: The responses dataframe, containing "formtype", "imp_marker",
            "200", "201" and the master columns.
        master_cols: Names of the master (target) columns whose values should
            be copied into the corresponding "_imputed" columns.

    Returns:
        The dataframe (modified in place and returned) with "imp_class" and the
        "_imputed" columns populated for short-form constructed records.
    """
    # Only short-form ("0006") records that were manually constructed need
    # this preparation; all other rows are left untouched.
    sf_constructed_mask = (df.formtype == "0006") & (df.imp_marker == "constructed")

    # Create the imputation class for the short-form constructed records by
    # joining the values of columns "200" and "201" with an underscore.
    df.loc[sf_constructed_mask, "imp_class"] = (
        df.loc[sf_constructed_mask, "200"] + "_" + df.loc[sf_constructed_mask, "201"]
    )

    # Copy the values of the master columns to the corresponding "_imputed"
    # columns, as imputation did not fill these for constructed records.
    for col in master_cols:
        df.loc[sf_constructed_mask, f"{col}_imputed"] = df.loc[sf_constructed_mask, col]

    return df


@df_change_func_wrap
def run_sf_expansion(df: pd.DataFrame, config: dict) -> pd.DataFrame:
"""Calculate the expansion imputated values for short forms using long form data."""

# Remove records that have the reference list variables
# and those that have "nan" in the imp class
filtered_df, excluded_df = split_df_on_imp_class(df)

# Get dictionary of short form master keys (or target variables)
# and breakdown variables
breakdown_dict = config["breakdowns"]
master_values = list(breakdown_dict)

# Prepare constructed short-form entries for sf expansion imputation
df = prepare_short_form_constructed(df, master_values)

# Remove records that have the reference list variables
# and those that have "nan" in the imp class
filtered_df, excluded_df = split_df_on_imp_class(df)

# Obtain the "threshold_num" from the config
# (this is the minimum viable number in an imputation class)
threshold_num = config["imputation"]["sf_expansion_threshold"]
Expand Down
Loading