Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

654 and 579 sf expansion for construction and nulls filled for CF #184

Merged
merged 3 commits into from
Jan 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.6.1"
__version__ = "0.6.2"
4 changes: 2 additions & 2 deletions src/developer_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ global:
load_updated_snapshot: False # Whether to load the updated snapshots for amendments and additions
load_ni_data: False
load_historic_data: False
run_construction: False
run_construction: True
run_ni_construction: False
load_manual_outliers: False
load_manual_imputation: False
Expand Down Expand Up @@ -87,7 +87,7 @@ network_paths:
backdata_path: "R:/BERD Results System Development 2023/2021_data/validation-extract-responses-202112.csv"
outliers_path: "R:/BERD Results System Development 2023/DAP_emulation/outliers"
manual_outliers_path: "R:/BERD Results System Development 2023/DAP_emulation/outliers/manual_outliers/manual_outlier_2023-08-29_v67.csv"
construction_file_path: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/JC_test_construction_file.csv"
construction_file_path: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/test_construction_file.csv"
construction_file_path_ni: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/test_construction_ni_file.csv"
# construction_add_path: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/construction_additions_2023-11-06_v5.csv" # TODO Need to test
# construction_amend_path: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/construction_amendments_2023-10-31_v2.csv" # TODO Need to test
Expand Down
15 changes: 10 additions & 5 deletions src/imputation/MoR.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
calculate_totals,
)


good_statuses = ["Clear", "Clear - overridden"]
bad_statuses = ["Form sent out", "Check needed"]

Expand Down Expand Up @@ -112,6 +113,8 @@ def carry_forwards(df, backdata, impute_vars):
df = pd.merge(
df, backdata, how="left", on="reference", suffixes=("", "_prev"), indicator=True
)
# ensure the instance columns are still type "int" after merge
df = df.astype({"instance": "Int64", "instance_prev": "Int64"})

# keep only the rows needed, see function docstring for details.
no_match_cond = df["_merge"] == "left_only"
Expand All @@ -124,15 +127,17 @@ def carry_forwards(df, backdata, impute_vars):
# Copy values from relevant columns where references match
match_cond = df["_merge"] == "both"

# replace the values of certain columns with the values from the back data
# TODO: Check with methodology or BAU as to which other cols to take from backdata
# TODO: By default, columns not updated such as 4xx, 5xx will contain the current
# data, instance 0.
# Replace the values of certain columns with the values from the back data
replace_vars = ["instance", "200", "201", "601", "602", "604"]
for var in replace_vars:
df.loc[match_cond, var] = df.loc[match_cond, f"{var}_prev"]

# Update the variables to be imputed by the corresponding previous values, filling
# nulls with zeros.
for var in impute_vars:
df.loc[match_cond, f"{var}_imputed"] = df.loc[match_cond, f"{var}_prev"]
df.loc[match_cond, f"{var}_imputed"] = df.loc[
match_cond, f"{var}_prev"
].fillna(0)
df.loc[match_cond, "imp_marker"] = "CF"

df.loc[match_cond] = create_imp_class_col(df, "200_prev", "201_prev")
Expand Down
7 changes: 4 additions & 3 deletions src/imputation/imputation_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,13 +125,14 @@ def run_imputation(
# Changing all records that meet the criteria to "604" == "Yes"
imputed_df.loc[(chk_mask & imputation_mask), "604"] = "Yes"

# Run short form expansion
imputed_df = run_sf_expansion(imputed_df, config)

# join constructed rows back to the imputed df
# Note that constructed rows need to be included in short form expansion
if "is_constructed" in df.columns:
imputed_df = pd.concat([imputed_df, constructed_df])

# Run short form expansion
imputed_df = run_sf_expansion(imputed_df, config)

# join manually trimmed columns back to the imputed df
if not trimmed_df.empty:
imputed_df = pd.concat([imputed_df, trimmed_df])
Expand Down
35 changes: 30 additions & 5 deletions src/imputation/sf_expansion.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,19 +165,44 @@ def apply_expansion(
return expanded_df


def prepare_short_form_constructed(
    df: pd.DataFrame, master_cols: List[str]
) -> pd.DataFrame:
    """Prepare the constructed short form responses for sf expansion.

    The constructed records were removed from imputation, so it is necessary to
    copy the master columns to the empty "imputed" master columns — for example,
    column "211" is copied to "211_imputed". It is also necessary to create
    imputation classes for these records.

    Args:
        df: The responses dataframe, containing "formtype", "imp_marker",
            "200", "201" and the master columns.
        master_cols: Names of the master (target) columns whose values should
            be copied into the corresponding "_imputed" columns.

    Returns:
        The dataframe (modified in place and returned) with "imp_class" and the
        "_imputed" columns populated for short-form constructed records.
    """
    # Only short-form ("0006") records that were manually constructed need
    # this preparation; all other rows are left untouched.
    sf_constructed_mask = (df.formtype == "0006") & (df.imp_marker == "constructed")

    # Create the imputation class for the short-form constructed records by
    # joining the values of columns "200" and "201" with an underscore.
    df.loc[sf_constructed_mask, "imp_class"] = (
        df.loc[sf_constructed_mask, "200"] + "_" + df.loc[sf_constructed_mask, "201"]
    )

    # Copy the values of the master columns to the corresponding "_imputed"
    # columns, as imputation did not fill these for constructed records.
    for col in master_cols:
        df.loc[sf_constructed_mask, f"{col}_imputed"] = df.loc[sf_constructed_mask, col]

    return df


@df_change_func_wrap
def run_sf_expansion(df: pd.DataFrame, config: dict) -> pd.DataFrame:
"""Calculate the expansion imputated values for short forms using long form data."""

# Remove records that have the reference list variables
# and those that have "nan" in the imp class
filtered_df, excluded_df = split_df_on_imp_class(df)

# Get dictionary of short form master keys (or target variables)
# and breakdown variables
breakdown_dict = config["breakdowns"]
master_values = list(breakdown_dict)

# Prepare constructed short-form entries for sf expansion imputation
df = prepare_short_form_constructed(df, master_values)

# Remove records that have the reference list variables
# and those that have "nan" in the imp class
filtered_df, excluded_df = split_df_on_imp_class(df)

# Obtain the "threshold_num" from the config
# (this is the minimum viable number in an imputation class)
threshold_num = config["imputation"]["sf_expansion_threshold"]
Expand Down
Loading