From c9bf7abe6a9eb0d367dd95be6cc9ce41e81f10df Mon Sep 17 00:00:00 2001
From: Tom Coates
Date: Fri, 5 Jan 2024 15:40:27 +0000
Subject: [PATCH 01/26] fix inconsistency in naming of short/long form

---
 ...{frozen_longform_schema.toml => long_form_schema.toml} | 0
 ...rozen_shortform_schema.toml => short_form_schema.toml} | 0
 src/developer_config.yaml                                 | 8 ++++----
 src/outputs/long_form.py                                  | 2 +-
 src/outputs/short_form.py                                 | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)
 rename config/output_schemas/{frozen_longform_schema.toml => long_form_schema.toml} (100%)
 rename config/output_schemas/{frozen_shortform_schema.toml => short_form_schema.toml} (100%)

diff --git a/config/output_schemas/frozen_longform_schema.toml b/config/output_schemas/long_form_schema.toml
similarity index 100%
rename from config/output_schemas/frozen_longform_schema.toml
rename to config/output_schemas/long_form_schema.toml
diff --git a/config/output_schemas/frozen_shortform_schema.toml b/config/output_schemas/short_form_schema.toml
similarity index 100%
rename from config/output_schemas/frozen_shortform_schema.toml
rename to config/output_schemas/short_form_schema.toml
diff --git a/src/developer_config.yaml b/src/developer_config.yaml
index 843c341d7..2c9677731 100644
--- a/src/developer_config.yaml
+++ b/src/developer_config.yaml
@@ -67,8 +67,8 @@ hdfs_paths:
   manual_imp_trim_path: "/ons/rdbe_dev/imputation/manual_trimming"
   outliers_path: "/ons/rdbe_dev/outliers"
   estimation_path: "/ons/rdbe_dev/estimation"
-  short_form_schema: "src/outputs/output_schemas/frozen_shortform_schema.toml"
-  long_form_schema: "src/outputs/output_schemas/frozen_longform_schema.toml"
+  short_form_schema: "src/outputs/output_schemas/short_form_schema.toml"
+  long_form_schema: "src/outputs/output_schemas/long_form_schema.toml"
   export_path: /ons/rdbe_dev/outgoing_export
   feather_path: "/ons/rdbe_dev/staging/feather"
 network_paths:
@@ -113,8 +113,8 @@ network_paths:
   civil_defence_detailed_path: "R:/BERD Results System Development 2023/DAP_emulation/mappers/civil_defence_detailed.csv"
   sic_division_detailed_path: "R:/BERD Results System Development 2023/DAP_emulation/mappers/sic_div_detailed.csv"
 schema_paths:
-  frozen_shortform_schema: "config/output_schemas/frozen_shortform_schema.toml"
-  frozen_longform_schema: "config/output_schemas/frozen_longform_schema.toml"
+  short_form_schema: "config/output_schemas/short_form_schema.toml"
+  long_form_schema: "config/output_schemas/long_form_schema.toml"
   tau_schema: "config/output_schemas/tau_schema.toml"
   gb_sas_schema: "config/output_schemas/gb_sas_schema.toml"
   ni_sas_schema: "config/output_schemas/ni_sas_schema.toml"
diff --git a/src/outputs/long_form.py b/src/outputs/long_form.py
index bdc75bdfc..250e0ca2c 100644
--- a/src/outputs/long_form.py
+++ b/src/outputs/long_form.py
@@ -49,7 +49,7 @@ def output_long_form(
     df = map_o.join_fgn_ownership(df, ultfoc_mapper)

     # Create long form output dataframe with required columns from schema
-    schema_path = config["schema_paths"]["frozen_longform_schema"]
+    schema_path = config["schema_paths"]["long_form_schema"]
     schema_dict = load_schema(schema_path)
     longform_output = create_output_df(df, schema_dict)

diff --git a/src/outputs/short_form.py b/src/outputs/short_form.py
index eaf5516ef..57ad5c667 100644
--- a/src/outputs/short_form.py
+++ b/src/outputs/short_form.py
@@ -136,7 +136,7 @@ def output_short_form(
     df = run_shortform_prep(df, round_val=4)

     # Create short form output dataframe with required columns from schema
-    schema_path = config["schema_paths"]["frozen_shortform_schema"]
config["schema_paths"]["frozen_shortform_schema"] + schema_path = config["schema_paths"]["short_form_schema"] schema_dict = load_schema(schema_path) shortform_output = create_output_df(df, schema_dict) From 93a1d89710c33a055f8aec56b68f1dc79f949f90 Mon Sep 17 00:00:00 2001 From: Griffith Date: Fri, 5 Jan 2024 17:46:05 +0000 Subject: [PATCH 02/26] script to create C:/Users/griffa1/Anaconda3/envs/resdev362/python.exe d:/coding_projects/research-and-development/unit_test_helper.py --- unit_test_helper.py | 60 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 unit_test_helper.py diff --git a/unit_test_helper.py b/unit_test_helper.py new file mode 100644 index 000000000..5e481a4bc --- /dev/null +++ b/unit_test_helper.py @@ -0,0 +1,60 @@ +"""Read in a csv file and ouput a test file with data for a unit test.""" +import pandas as pd +import os + +# configuration settings +csv_path = "D:/coding_projects/randd_test_data/" +input_file = "outlier_test1.csv" + +# whether the unit test data is input or expected output +in_or_output = "input" + +output_filename = f"new_{in_or_output}_function" + +# read in the csv +path1 = os.path.join(csv_path, input_file) +df1 = pd.read_csv(path1) + + +# set all datatypes to string - we are outputting all the data as a string +df1 = df1.astype(str) + +# add quotes to the strings in the columns that should show as string types +string_cols = ["period"] + +df1[string_cols] = df1[string_cols].applymap('"{}"'.format) + +# prepare the output formatting +tab = " "*4 + +col_list = df1.columns +col_string = "" + +# create a new column that joins the contents of the other columns +df1['output'] = f"{tab}[" +for col in df1.columns[:-1]: + df1["output"] += df1[col] + ", " + col_string += f'{tab}{tab}"{col}",\n' + +df1['output'] += df1[df1.columns[-2]] + "]," + +# concatenate everything in the new column into a single string +rows_string = df1["output"].str.cat(sep=f"\n{tab}") + +# join all the components into a final string for output +full_text = f'''def create_input_df(self): + """Create an input dataframe for the test.""" + {in_or_output}_columns = [\n{col_string}{tab}] + + data = [\n{tab}{rows_string}] + + {in_or_output}_df = pandasDF(data=data, columns={in_or_output}_columns) + return {in_or_output}_df + ''' + +# write the prepared text to a txt file +out_path = os.path.join(csv_path, output_filename + ".txt") + +text_file = open(out_path, "w") +text_file.write(full_text) +text_file.close() From b72d7ec8fde0c67c554528260435bb1f518338ff Mon Sep 17 00:00:00 2001 From: Griffith Date: Fri, 5 Jan 2024 17:48:34 +0000 Subject: [PATCH 03/26] script to create unit test dataframe from csv --- unit_test_helper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unit_test_helper.py b/unit_test_helper.py index 5e481a4bc..fccfc59c7 100644 --- a/unit_test_helper.py +++ b/unit_test_helper.py @@ -9,7 +9,7 @@ # whether the unit test data is input or expected output in_or_output = "input" -output_filename = f"new_{in_or_output}_function" +output_filename = f"{in_or_output}_function" # read in the csv path1 = os.path.join(csv_path, input_file) @@ -46,7 +46,7 @@ """Create an input dataframe for the test.""" {in_or_output}_columns = [\n{col_string}{tab}] - data = [\n{tab}{rows_string}] + data = [\n{tab}{rows_string}\n{tab}] {in_or_output}_df = pandasDF(data=data, columns={in_or_output}_columns) return {in_or_output}_df From 43f2841c85173f180516a807c30f27b26e6beb9b Mon Sep 17 00:00:00 2001 From: George Zorinyants Date: Tue, 9 Jan 
2024 10:05:09 +0000 Subject: [PATCH 04/26] Changed postcode column from postcodes_harmonised to 601 --- src/site_apportionment/site_apportionment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/site_apportionment/site_apportionment.py b/src/site_apportionment/site_apportionment.py index 2b5bec401..8fff9f701 100644 --- a/src/site_apportionment/site_apportionment.py +++ b/src/site_apportionment/site_apportionment.py @@ -12,7 +12,7 @@ ins = "instance" period = "period" form = "formtype" -postcode = "postcodes_harmonised" +postcode = "601" # "postcodes_harmonised" percent = "602" product = "201" pg_num = "pg_numeric" From 6b57fba3453f279bd96318ad939c6df8e02a04d3 Mon Sep 17 00:00:00 2001 From: Griffith Date: Tue, 9 Jan 2024 10:15:37 +0000 Subject: [PATCH 05/26] remove duplicate output_imputation --- src/developer_config.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/src/developer_config.yaml b/src/developer_config.yaml index e07316c20..6432ea285 100644 --- a/src/developer_config.yaml +++ b/src/developer_config.yaml @@ -27,7 +27,6 @@ global: output_auto_outliers: False output_outlier_qa : False output_estimation_qa: False - output_imputation_qa: False output_apportionment_qa: False output_long_form: False output_short_form: False From 8dc14abbfdc4859865591483a4489edd0c5a6093 Mon Sep 17 00:00:00 2001 From: George Zorinyants Date: Tue, 9 Jan 2024 14:25:18 +0000 Subject: [PATCH 06/26] Postcode topup is applied to 601 in validation --- src/staging/validation.py | 78 +++++++-------------------------------- 1 file changed, 14 insertions(+), 64 deletions(-) diff --git a/src/staging/validation.py b/src/staging/validation.py index 8b257cc69..417a28116 100644 --- a/src/staging/validation.py +++ b/src/staging/validation.py @@ -185,6 +185,7 @@ def validate_post_col( ) df["postcodes_harmonised"] = df["postcodes_harmonised"].apply(postcode_topup) + df["601"] = df["601"].apply(postcode_topup) ValidationLogger.info("All postcodes validated....") @@ -333,9 +334,6 @@ def load_schema(file_path: str = "./config/contributors_schema.toml") -> dict: toml_dict = toml.load(file_path) else: # Return False if file does not exist - ValidationLogger.warning( - "Validation schema does not exist! Path may be incorrect" - ) return file_exists return toml_dict @@ -418,9 +416,6 @@ def validate_data_with_schema(survey_df: pd.DataFrame, schema_path: str): # Load schema from toml dtypes_schema = load_schema(schema_path) - if not dtypes_schema: - raise FileNotFoundError(f"File at {schema_path} does not exist. Check path") - # Create a dict for dtypes only dtypes_dict = { column_nm: dtypes_schema[column_nm]["Deduced_Data_Type"] @@ -447,15 +442,6 @@ def validate_data_with_schema(survey_df: pd.DataFrame, schema_path: str): survey_df[column] = survey_df[column].astype(pd.Int64Dtype()) elif dtypes_dict[column] == "str": survey_df[column] = survey_df[column].astype("string") - elif "datetime" in dtypes_dict[column]: - try: - survey_df[column] = pd.to_datetime( - survey_df[column], errors="coerce" - ) - except TypeError: - raise TypeError( - f"Failed to convert column '{column}' to datetime. Please check the data." 
- ) else: survey_df[column] = survey_df[column].astype(dtypes_dict[column]) ValidationLogger.debug(f"{column} after: {survey_df[column].dtype}") @@ -565,31 +551,22 @@ def check_ultfoc(value): @time_logger_wrap @exception_wrap -def validate_many_to_one(*args) -> pd.DataFrame: +def validate_many_to_one( + mapper: pd.DataFrame, col_many: str, col_one: str +) -> pd.DataFrame: """ - Validates a many-to-one mapper DataFrame. - This function performs the following checks: - 1. Checks if the mapper has two specified columns, referred to as 'col_many' and 'col_one'. - 2. Selects and deduplicates 'col_many' and 'col_one'. - 3. Checks that for each entry in 'col_many' there is exactly one corresponding entry in 'col_one'. + Validates a many to one mapper: + 1. Checks if the mapper has two columns col_many and col_one. + 2. Salects and deduplicates col_many and col_one. + 3. Checks that for each entry in col_many there is exactly one entry in + col_one. Args: - *args: Variable length argument list. It should contain the following items in order: - - df (pd.DataFrame): The input mapper DataFrame. - - col_many (str): The name of the column with many entries. - - col_one (str): The name of the column with one entry. - - Returns: - pd.DataFrame: The validated mapper DataFrame with deduplicated 'col_many' and 'col_one' columns. - - Raises: - ValueError: If the mapper does not have the 'col_many' and 'col_one' columns, or if there are multiple entries in 'col_one' for any entry in 'col_many'. + df (pd.DataFrame): The input mapper + col_many (str): name of the column with many entries + col_one (str): name of the column with one entry """ - - mapper = args[0] - col_many = args[1] - col_one = args[2] try: # Check that expected column are present cols = mapper.columns @@ -611,7 +588,7 @@ def validate_many_to_one(*args) -> pd.DataFrame: ValidationLogger.info( "The following codes have multile mapping: \n {df_bad}" ) - raise ValueError("Mapper is many to many") + raise ValueError(f"Mapper is many to many") return df except ValueError as ve: @@ -648,7 +625,7 @@ def validate_cora_df(df: pd.DataFrame) -> pd.DataFrame: df["contents_check"] = status_check & from_status_check # Check if there are any False values in the "contents_check" column - if (df["contents_check"] == False).any(): # noqa + if (df["contents_check"] == False).any(): raise ValueError("Unexpected format within column contents") # Drop the "contents_check" column @@ -658,30 +635,3 @@ def validate_cora_df(df: pd.DataFrame) -> pd.DataFrame: except ValueError as ve: raise ValueError("cora status mapper validation failed: " + str(ve)) - - -def flag_no_rand_spenders(df, raise_or_warn): - """ - Flags any records that answer "No" to "604" and also report their expenditure in "211" as more than 0. - - Parameters: - df (pandas.DataFrame): The input DataFrame. 
- - Returns: - None - """ - invalid_records = df.loc[(df["604"] == "No") & (df["211"] > 0)] - - if not invalid_records.empty: - if raise_or_warn == "raise": - raise Exception("Some records report no R&D, but spend in 211 > 0.") - elif raise_or_warn == "warn": - total_invalid_spend = invalid_records["211"].sum() - ValidationLogger.error("Some records report no R&D, but spend in 211 > 0.") - ValidationLogger.error( - f"The total spend of 'No' R&D companies is £{int(total_invalid_spend)}" - ) - ValidationLogger.error(invalid_records) - - else: - ValidationLogger.debug("All records have valid R&D spend.") From c215a842b259c763ea86ed18e265dba52587c316 Mon Sep 17 00:00:00 2001 From: Griffith Date: Thu, 11 Jan 2024 10:13:06 +0000 Subject: [PATCH 07/26] move the removal of filter qa to imputation --- src/developer_config.yaml | 1 - src/imputation/imputation_helpers.py | 54 ++++++++++++++++++++++++-- src/imputation/imputation_main.py | 19 ++++----- src/outlier_detection/auto_outliers.py | 8 ---- src/outputs/form_output_prep.py | 30 ++------------ src/outputs/outputs_main.py | 15 +------ 6 files changed, 66 insertions(+), 61 deletions(-) diff --git a/src/developer_config.yaml b/src/developer_config.yaml index e07316c20..6432ea285 100644 --- a/src/developer_config.yaml +++ b/src/developer_config.yaml @@ -27,7 +27,6 @@ global: output_auto_outliers: False output_outlier_qa : False output_estimation_qa: False - output_imputation_qa: False output_apportionment_qa: False output_long_form: False output_short_form: False diff --git a/src/imputation/imputation_helpers.py b/src/imputation/imputation_helpers.py index db4707e48..f12dffdc5 100644 --- a/src/imputation/imputation_helpers.py +++ b/src/imputation/imputation_helpers.py @@ -1,11 +1,11 @@ """Utility functions to be used in the imputation module.""" import logging - -from typing import List import pandas as pd - +from typing import List, Dict, Callable from itertools import chain +from src.outputs.status_filtered import output_status_filtered + ImputationHelpersLogger = logging.getLogger(__name__) @@ -219,3 +219,51 @@ def fill_sf_zeros(df: pd.DataFrame) -> pd.DataFrame: df.loc[(sf_mask & clear_mask), q] = df.copy()[q].fillna(0) return df + + +def tidy_imputation_dataframe( + df: pd.DataFrame, + config: Dict, + logger, + to_impute_cols: List, + write_csv: Callable, + run_id: int, + ) -> pd.DataFrame: + """Remove rows and columns not needed after imputation.""" + # Create lists for the qa cols + imp_cols = [f"{col}_imputed" for col in to_impute_cols] + + # Update the original breakdown questions and target variables with the imputed + df[to_impute_cols] = df[imp_cols] + + # Remove all qa columns + to_drop = [ + col + for col in df.columns + if (col.endswith("prev") | col.endswith("imputed") | col.endswith("link")) + ] + df = df.drop(columns=to_drop) + + # Keep only clear and imputed records + imputed_statuses = ["TMI", "CF", "MoR", "constructed"] + to_keep = df["imp_marker"].isin(imputed_statuses) | (df["imp_marker"] == "R") + + to_keep_df = df.copy().loc[to_keep] + filtered_output_df = df.copy().loc[~to_keep] + + # change the value of the status column to 'imputed' for imputed statuses + condition = to_keep_df["status"].isin(imputed_statuses) + to_keep_df.loc[condition, "status"] = "imputed" + + # Running status filtered full dataframe output for QA + if config["global"]["output_status_filtered"]: + logger.info("Starting status filtered output...") + output_status_filtered( + filtered_output_df, + config, + write_csv, + run_id, + ) + 
logger.info("Finished status filtered output.") + + return to_keep_df diff --git a/src/imputation/imputation_main.py b/src/imputation/imputation_main.py index a023f982c..a08c6365c 100644 --- a/src/imputation/imputation_main.py +++ b/src/imputation/imputation_main.py @@ -141,13 +141,14 @@ def run_imputation( ImputationMainLogger.info("Finished Imputation calculation.") - # Create names for imputed cols - imp_cols = [f"{col}_imputed" for col in to_impute_cols] - - # Update the original breakdown questions and target variables with the imputed - imputed_df[to_impute_cols] = imputed_df[imp_cols] - - # Drop imputed values from df - imputed_df = imputed_df.drop(columns=imp_cols) - + # remove rows and columns no longer needed from the imputed dataframe + imputed_df = hlp.tidy_imputation_dataframe( + imputed_df, + config, + ImputationMainLogger, + to_impute_cols, + write_csv, + run_id, + ) + return imputed_df diff --git a/src/outlier_detection/auto_outliers.py b/src/outlier_detection/auto_outliers.py index 26eae3583..b3eaa3918 100644 --- a/src/outlier_detection/auto_outliers.py +++ b/src/outlier_detection/auto_outliers.py @@ -238,14 +238,6 @@ def run_auto_flagging( # loop through all columns to be flagged for outliers for value_col in flag_value_cols: - # to_numeric is needed to convert strings. However 'coerce' - # means values that - # can't be converted are represented by NaNs. - # TODO data validation and cleaning should replace the need for - # 'to_numeric' - # check ticket (RDRP-386) - df[value_col] = pd.to_numeric(df[value_col], errors="coerce") - # Call function to add a flag for auto outliers in column value_col df = flag_outliers(df, upper_clip, lower_clip, value_col) diff --git a/src/outputs/form_output_prep.py b/src/outputs/form_output_prep.py index 0e3898484..6ef352657 100644 --- a/src/outputs/form_output_prep.py +++ b/src/outputs/form_output_prep.py @@ -28,26 +28,13 @@ def form_output_prep( filtered_output_df (pd.DataFrame): data noot used in outputs """ - - imputed_statuses = ["TMI", "CF", "MoR", "constructed"] - - to_keep = estimated_df["imp_marker"].isin(imputed_statuses) | ( - estimated_df["imp_marker"] == "R" - ) - # Deal with "No" in 604, also eliminating spenders flag_no_rand_spenders(estimated_df, "error") no_rnd_spenders_filter = ~( (estimated_df["604"] == "No") & (estimated_df["211"] > 0) ) - estimated_df = estimated_df.copy().loc[no_rnd_spenders_filter] - - # filter estimated_df and weighted_df to only include clear or imputed statuses - outputs_df = estimated_df.copy().loc[to_keep] - tau_outputs_df = weighted_df.copy().loc[to_keep] - - # filter estimated_df for records not included in outputs - filtered_output_df = estimated_df.copy().loc[~to_keep] + outputs_df = estimated_df.copy().loc[no_rnd_spenders_filter] + tau_outputs_df = weighted_df.copy().loc[no_rnd_spenders_filter] if ni_full_responses is not None: # Add required columns to NI data @@ -66,19 +53,10 @@ def form_output_prep( # outputs_df = pd.concat([outputs_df, ni_full_responses]) tau_outputs_df = pd.concat([tau_outputs_df, ni_full_responses]) - # change the value of the status column to 'imputed' for imputed statuses - condition = outputs_df["status"].isin(imputed_statuses) - outputs_df.loc[condition, "status"] = "imputed" - - return ni_full_responses, outputs_df, tau_outputs_df, filtered_output_df + return ni_full_responses, outputs_df, tau_outputs_df else: - - # change the value of the status column to 'imputed' for imputed statuses - condition = outputs_df["status"].isin(imputed_statuses) - 
outputs_df.loc[condition, "status"] = "imputed" - # create an empty ni_responses dataframe ni_full_responses = pd.DataFrame() - return ni_full_responses, outputs_df, tau_outputs_df, filtered_output_df + return ni_full_responses, outputs_df, tau_outputs_df diff --git a/src/outputs/outputs_main.py b/src/outputs/outputs_main.py index c61280772..dc93b367d 100644 --- a/src/outputs/outputs_main.py +++ b/src/outputs/outputs_main.py @@ -4,7 +4,6 @@ from typing import Callable, Dict, Any from src.outputs.form_output_prep import form_output_prep -from src.outputs.status_filtered import output_status_filtered from src.outputs.short_form import output_short_form from src.outputs.long_form import output_long_form from src.outputs.tau import output_tau @@ -64,8 +63,7 @@ def run_outputs( ( ni_full_responses, outputs_df, - tau_outputs_df, - filtered_output_df, + tau_outputs_df ) = form_output_prep( estimated_df, weighted_df, @@ -74,17 +72,6 @@ def run_outputs( sic_pg_alpha, ) - # Running status filtered full dataframe output for QA - if config["global"]["output_status_filtered"]: - OutputMainLogger.info("Starting status filtered output...") - output_status_filtered( - filtered_output_df, - config, - write_csv, - run_id, - ) - OutputMainLogger.info("Finished status filtered output.") - # Running short form output if config["global"]["output_short_form"]: OutputMainLogger.info("Starting short form output...") From 2a42dcb0922eeb2a6cde4c2c08990821d642c309 Mon Sep 17 00:00:00 2001 From: Griffith Date: Thu, 11 Jan 2024 10:36:00 +0000 Subject: [PATCH 08/26] updated the filtered_qa_schema toml --- .../status_filtered_qa_schema.toml | 280 ------------------ src/imputation/imputation_helpers.py | 4 + 2 files changed, 4 insertions(+), 280 deletions(-) diff --git a/config/output_schemas/status_filtered_qa_schema.toml b/config/output_schemas/status_filtered_qa_schema.toml index 148a8cb81..2e520fe95 100644 --- a/config/output_schemas/status_filtered_qa_schema.toml +++ b/config/output_schemas/status_filtered_qa_schema.toml @@ -646,286 +646,6 @@ Deduced_Data_Type = "object" old_name = "imp_class" Deduced_Data_Type = "object" -[202_prev] -old_name = "202_prev" -Deduced_Data_Type = "float64" - -[203_prev] -old_name = "203_prev" -Deduced_Data_Type = "float64" - -[204_prev] -old_name = "204_prev" -Deduced_Data_Type = "float64" - -[205_prev] -old_name = "205_prev" -Deduced_Data_Type = "float64" - -[206_prev] -old_name = "206_prev" -Deduced_Data_Type = "float64" - -[207_prev] -old_name = "207_prev" -Deduced_Data_Type = "float64" - -[209_prev] -old_name = "209_prev" -Deduced_Data_Type = "float64" - -[210_prev] -old_name = "210_prev" -Deduced_Data_Type = "float64" - -[211_prev] -old_name = "211_prev" -Deduced_Data_Type = "float64" - -[212_prev] -old_name = "212_prev" -Deduced_Data_Type = "float64" - -[214_prev] -old_name = "214_prev" -Deduced_Data_Type = "float64" - -[216_prev] -old_name = "216_prev" -Deduced_Data_Type = "float64" - -[218_prev] -old_name = "218_prev" -Deduced_Data_Type = "float64" - -[219_prev] -old_name = "219_prev" -Deduced_Data_Type = "float64" - -[220_prev] -old_name = "220_prev" -Deduced_Data_Type = "float64" - -[221_prev] -old_name = "221_prev" -Deduced_Data_Type = "float64" - -[222_prev] -old_name = "222_prev" -Deduced_Data_Type = "float64" - -[223_prev] -old_name = "223_prev" -Deduced_Data_Type = "float64" - -[225_prev] -old_name = "225_prev" -Deduced_Data_Type = "float64" - -[226_prev] -old_name = "226_prev" -Deduced_Data_Type = "float64" - -[227_prev] -old_name = "227_prev" -Deduced_Data_Type = 
"float64" - -[228_prev] -old_name = "228_prev" -Deduced_Data_Type = "float64" - -[229_prev] -old_name = "229_prev" -Deduced_Data_Type = "float64" - -[237_prev] -old_name = "237_prev" -Deduced_Data_Type = "float64" - -[242_prev] -old_name = "242_prev" -Deduced_Data_Type = "float64" - -[243_prev] -old_name = "243_prev" -Deduced_Data_Type = "float64" - -[244_prev] -old_name = "244_prev" -Deduced_Data_Type = "float64" - -[245_prev] -old_name = "245_prev" -Deduced_Data_Type = "float64" - -[246_prev] -old_name = "246_prev" -Deduced_Data_Type = "float64" - -[247_prev] -old_name = "247_prev" -Deduced_Data_Type = "float64" - -[248_prev] -old_name = "248_prev" -Deduced_Data_Type = "float64" - -[249_prev] -old_name = "249_prev" -Deduced_Data_Type = "float64" - -[250_prev] -old_name = "250_prev" -Deduced_Data_Type = "float64" - -[302_prev] -old_name = "302_prev" -Deduced_Data_Type = "float64" - -[303_prev] -old_name = "303_prev" -Deduced_Data_Type = "float64" - -[304_prev] -old_name = "304_prev" -Deduced_Data_Type = "float64" - -[305_prev] -old_name = "305_prev" -Deduced_Data_Type = "float64" - -[emp_researcher_prev] -old_name = "emp_researcher_prev" -Deduced_Data_Type = "float64" - -[emp_technician_prev] -old_name = "emp_technician_prev" -Deduced_Data_Type = "float64" - -[emp_other_prev] -old_name = "emp_other_prev" -Deduced_Data_Type = "float64" - -[emp_total_prev] -old_name = "emp_total_prev" -Deduced_Data_Type = "float64" - -[headcount_res_m_prev] -old_name = "headcount_res_m_prev" -Deduced_Data_Type = "float64" - -[headcount_res_f_prev] -old_name = "headcount_res_f_prev" -Deduced_Data_Type = "float64" - -[headcount_tec_m_prev] -old_name = "headcount_tec_m_prev" -Deduced_Data_Type = "float64" - -[headcount_tec_f_prev] -old_name = "headcount_tec_f_prev" -Deduced_Data_Type = "float64" - -[headcount_oth_m_prev] -old_name = "headcount_oth_m_prev" -Deduced_Data_Type = "float64" - -[headcount_oth_f_prev] -old_name = "headcount_oth_f_prev" -Deduced_Data_Type = "float64" - -[headcount_tot_m_prev] -old_name = "headcount_tot_m_prev" -Deduced_Data_Type = "float64" - -[headcount_tot_f_prev] -old_name = "headcount_tot_f_prev" -Deduced_Data_Type = "float64" - -[headcount_total_prev] -old_name = "headcount_total_prev" -Deduced_Data_Type = "float64" - -[211_link] -old_name = "211_link" -Deduced_Data_Type = "float64" - -[305_link] -old_name = "305_link" -Deduced_Data_Type = "float64" - -[emp_researcher_link] -old_name = "emp_researcher_link" -Deduced_Data_Type = "float64" - -[emp_technician_link] -old_name = "emp_technician_link" -Deduced_Data_Type = "float64" - -[emp_other_link] -old_name = "emp_other_link" -Deduced_Data_Type = "float64" - -[headcount_res_m_link] -old_name = "headcount_res_m_link" -Deduced_Data_Type = "float64" - -[headcount_res_f_link] -old_name = "headcount_res_f_link" -Deduced_Data_Type = "float64" - -[headcount_tec_m_link] -old_name = "headcount_tec_m_link" -Deduced_Data_Type = "float64" - -[headcount_tec_f_link] -old_name = "headcount_tec_f_link" -Deduced_Data_Type = "float64" - -[headcount_oth_m_link] -old_name = "headcount_oth_m_link" -Deduced_Data_Type = "float64" - -[headcount_oth_f_link] -old_name = "headcount_oth_f_link" -Deduced_Data_Type = "float64" - -[200_original] -old_name = "200_original" -Deduced_Data_Type = "float64" - -[pg_sic_class] -old_name = "pg_sic_class" -Deduced_Data_Type = "object" - -[empty_pgsic_group] -old_name = "empty_pgsic_group" -Deduced_Data_Type = "object" - -[empty_pg_group] -old_name = "empty_pg_group" -Deduced_Data_Type = "object" - -[200_imp_marker] 
-old_name = "200_imp_marker" -Deduced_Data_Type = "object" - -[211_trim] -old_name = "211_trim" -Deduced_Data_Type = "object" - -[305_trim] -old_name = "305_trim" -Deduced_Data_Type = "object" - -[manual_trim] -old_name = "manual_trim" -Deduced_Data_Type = "object" - -[sf_expansion_grouping] -old_name = "sf_expansion_grouping" -Deduced_Data_Type = "object" - [auto_outlier] old_name = "auto_outlier" Deduced_Data_Type = "bool" diff --git a/src/imputation/imputation_helpers.py b/src/imputation/imputation_helpers.py index f12dffdc5..f91f646b5 100644 --- a/src/imputation/imputation_helpers.py +++ b/src/imputation/imputation_helpers.py @@ -242,6 +242,10 @@ def tidy_imputation_dataframe( for col in df.columns if (col.endswith("prev") | col.endswith("imputed") | col.endswith("link")) ] + + to_drop += ["200_original", "pg_sic_class", "empty_pgsic_group", "empty_pg_group"] + to_drop += ["200_imp_marker", "211_trim", "305_trim", "manual_trim"] + to_drop += ["sf_expansion_grouping"] df = df.drop(columns=to_drop) # Keep only clear and imputed records From 638e5c153876cbfc65fcde55f8d15c4e970e2704 Mon Sep 17 00:00:00 2001 From: Griffith Date: Thu, 11 Jan 2024 11:02:18 +0000 Subject: [PATCH 09/26] correct the cols to be dropped at the end of imputation --- src/imputation/imputation.py | 581 ----------- src/imputation/imputation_helpers.py | 1 - tests/test_imputation/test_imputation.py | 1173 ---------------------- 3 files changed, 1755 deletions(-) delete mode 100644 src/imputation/imputation.py delete mode 100644 tests/test_imputation/test_imputation.py diff --git a/src/imputation/imputation.py b/src/imputation/imputation.py deleted file mode 100644 index a8c2c8c60..000000000 --- a/src/imputation/imputation.py +++ /dev/null @@ -1,581 +0,0 @@ -import pandas as pd -import numpy as np -import logging - -# TODO almost each could be further generalised in terms of -# variable and function names - -ImputationLogger = logging.getLogger(__name__) - - -def filter_by_column_content( - raw_df: pd.DataFrame, column: str, column_content: str -) -> pd.DataFrame: - """Filter a column for specific string content. - - Args: - raw_df (pd.DataFrame): The dataframe to be filtered. - column (str): The name of the column to be filtered. - column_content (str): The content to be filtered on. - - Returns: - pd.DataFrame: The filtered dataframe. - """ - # filter for rows with column_content - clean_df = raw_df[raw_df[column] == column_content].copy() - - return clean_df - - -def rename_imp_col(clean_df: pd.DataFrame): - """ - This function renames columns in dataframe, replacing civ_or_def with 200 - and Product_group with 201 if they are present. - - Args: - clean_df (pd.DataFrame): Input Dataframe to rename columns. - - Returns: - pd.Dataframe: returns dataframe with renamed columns. 
- """ - if "civ_or_def" in clean_df.columns: - clean_df = clean_df.rename(columns={"civ_or_def": "200"}) - - if "Product_group" in clean_df.columns: - clean_df = clean_df.rename(columns={"Product_group": "201"}) - - return clean_df - - -def create_imp_class_col( - clean_df: pd.DataFrame, col_first_half: str, col_second_half: str, class_name: str -) -> pd.DataFrame: - """_summary_ - - Args: - clean_df (_type_): _description_ - - Returns: - _type_: _description_ - """ - - # TODO remove when using real data - clean_df[f"{col_second_half}"] = clean_df[f"{col_second_half}"].astype(str) - - # Create class col with concatenation - clean_df[f"{class_name}"] = ( - clean_df[f"{col_first_half}"] + "_" + clean_df[f"{col_second_half}"] - ) - - return clean_df - - -def filter_same_class( - clean_df: pd.DataFrame, current_period: str, previous_period: str -) -> pd.DataFrame: - """_summary_ - Args: - clean_df (_type_): _description_ - - Returns: - _type_: _description_ - """ - - # Filter for cols with same contents - clean_same_class_df = clean_df[ - clean_df[f"{current_period}_class"] == clean_df[f"{previous_period}_class"] - ].copy() - - return clean_same_class_df - - -def filter_pairs( - clean_same_class_df: pd.DataFrame, - target_variable: str, - current_period: str, - previous_period: str, -) -> pd.DataFrame: - """_summary_ Checks two columns have same contents - - Args: - clean_same_class_df (_type_): _description_ - - Returns: - _type_: _description_ - """ - # TODO needs more tweeks but essentially same as - # filter_same_class but for target var not class - matched_pairs_df = clean_same_class_df[ - (clean_same_class_df[f"{current_period}_{target_variable}_status"] == "Present") - & ( - clean_same_class_df[f"{previous_period}_{target_variable}_status"] - == "Present" - ) - ].copy() - - return matched_pairs_df - - -def flag_nulls_and_zeros( - target_variables_list: list, - df: pd.DataFrame, - curr_q: str, - prev_q: str, -): - """Flag target variables containing nulls or zreos. - - A new column {var}_valid is created for each var in the target variables. - This is flagged with 1 if either the current period or previous period - contains either a null or a zero. Otherwise, the flag is 0. - - Args: - target_variables (list of str): the target variables - df (pd.DataFrame): dataframe with current and previous periods - curr_q (str): the current period - prev_q (str): the previous period - - Returns: - pd.DataFrame - a dataframe indicating nulls and zeros in target cols. - """ - df = df.copy() - for var in target_variables_list: - cond1 = (df[f"{curr_q}_{var}"].isnull()) | (df[f"{prev_q}_{var}"].isnull()) - cond2 = (df[f"{curr_q}_{var}"] == 0) | (df[f"{prev_q}_{var}"] == 0) - df[f"{var}_valid"] = np.where(cond1 | cond2, False, True) - - return df - - -def calc_growth_ratio( - target_variable: str, - df: pd.DataFrame, - current_period: int, - previous_period: int, -) -> pd.DataFrame: - """Calculate the growth ratio for imputation. - - For the current target_variable, a growth_ratio column is created. - A growth rate is calculated for those rows where the "target_value_valid" - is true, meaning that there are no nulls or zeros in the previous or - current periods, TODO and the status is a 'responder' status. - - If this condition is not met, the row has a null value in this column. - - Args: - target_variable (str): The column name of the target variable. - df (pd.DataFrame): The dataframe containing the target variables. 
- current_period - - Returns: - pd.DataFrame - """ - flagged_df = flag_nulls_and_zeros( - [target_variable], df, current_period, previous_period - ) - - responder_statuses = ["Clear", "Clear - overridden", "Clear - overridden SE"] - - cond1 = flagged_df[f"{target_variable}_valid"] - cond2 = flagged_df["status"].isin(responder_statuses) - - flagged_df[f"{target_variable}_growth_ratio"] = np.where( - cond1 & cond2, - ( - df[f"{current_period}_{target_variable}"] - / df[f"{previous_period}_{target_variable}"] - ), - np.nan, - ) - df = flagged_df.drop(columns=[f"{target_variable}_valid"]) - - return df - - -def sort_df(target_variable: str, df: pd.DataFrame) -> pd.DataFrame: - """_summary_ - - Args: - target_variable (_type_): _description_ - - Returns: - _type_: _description_ - """ - # import ipdb - - # ipdb.set_trace() - # sorted based on hard coded list (in arg by=) - sorted_df = df.sort_values( - by=[ - "200", - "201", - f"{target_variable}_growth_ratio", - "employees", - "reference", - ], - ascending=[True, True, True, False, True], - ) - sorted_df.reset_index(drop=True, inplace=True) - - return sorted_df - - -def trim_check( - df: pd.DataFrame, check_value=10 -) -> pd.DataFrame: # TODO add check_value to a cofig - """_summary_ - - Args: - df (pd.DataFrame, check_value, optional): _description_ - Defaults to 10)->pd.DataFrame(. - - Returns: - _type_: _description_ - """ - # tag for those classes with more than check_value (currently 10) - if len(df) <= check_value: # TODO or is this just < - df["trim_check"] = "below_trim_threshold" - else: - df["trim_check"] = "above_trim_threshold" - - return df - - -def trim_bounds( - df: pd.DataFrame, - lower_perc=15, # TODO add percentages to config - - # check method inBERD_imputation_spec_V3 - upper_perc=15, -) -> pd.DataFrame: - """_summary_ - - Args: - df (pd.DataFrame, lower_perc, optional): _description_. - Defaults to 15, TODO add percentages to config - - Returns: - _type_: _description_ - """ - # trim only if more than 10 - df = filter_by_column_content(df, "trim_check", "above_trim_threshold") - df.reset_index(drop=True, inplace=True) - - # define the bounds for trimming - remove_lower = np.ceil(len(df) * (lower_perc / 100)) - remove_upper = np.ceil(len(df) * (1 - upper_perc / 100)) - - # create trim tag (distinct from trim_check) - # to mark which to trim for mean growth ratio - df["trim"] = "do trim" - df.loc[ - remove_lower : remove_upper - 2, "trim" - ] = "dont trim" # TODO check if needs to be inclusive of exlusive - - return df - - -def get_mean_growth_ratio( - df: pd.DataFrame, - dict_mean_growth_ratio: dict, # TODO maybe rename to more decriptive name - unique_item: str, - target_variable: str, -) -> pd.DataFrame: - """_summary_ - - Args: - dict_mean_growth_ratio (_type_): _description_ - - Returns: - _type_: _description_ - """ - """Including the count of matched pairs -for each imputed variable and imputation -class in the output would be helpful for -the RAP team and MQD to determine the -quality of the imputed value. 
""" - - # remove the "trim" tagged rows - df_trimmed = filter_by_column_content(df, "trim", "dont trim") - - dict_mean_growth_ratio[ - f"{unique_item}_{target_variable}_mean_growth_ratio and count" - ] = [ - df_trimmed[f"{target_variable}_growth_ratio"].mean(), - len(df_trimmed), - ] # TODO check same len(df[f'{target_variable}_growth_ratio'] and len(df) - # Also add to a dataframe: - # df[f'{target_variable}_mean_growth_ratio'] = \ - # df[f'{target_variable}_growth_ratio'].mean() - - return dict_mean_growth_ratio # TODO aka "imputation links" - # what naming is best? - - -def loop_unique( - df: pd.DataFrame, # TODO think of a better name for function - column: str, - target_variables_list: list, - current_period: str, - previous_period: str, - dict_mean_growth_ratio={}, -) -> pd.DataFrame: - """_summary_ - - Args: - df (_type_): _description_ - - Returns: - _type_: _description_ - """ - # will be looping over the class col - # dict_mean_growth_ratio = {} # TODO change to dict at the end - # growth_ratio_dfs_list = [] - # for subsets of class and then on target variable at a time - # growht ratio in calculated, data is sorted, trim check done, - # trim bounds calculated and labelled then mean growth ratio - # calculated and stored in a dictionary - for unique_item in df[column].unique(): - unique_item_df = df[df[column] == unique_item].copy() - for target_variable in target_variables_list: - growth_ratio_df = calc_growth_ratio( - target_variable, unique_item_df, current_period, previous_period - ) - sorted_df = sort_df(target_variable, growth_ratio_df) - trim_check_df = trim_check(sorted_df) - trimmed_df = trim_bounds(trim_check_df) - - dict_mean_growth_ratio = get_mean_growth_ratio( - trimmed_df, dict_mean_growth_ratio, unique_item, target_variable - ) - # growth_ratio_dfs_list.append(growth_ratio_df) - # could also store in a df? - - # growth_ratio_df = pd.concat(growth_ratio_dfs_list) - # could also store ina dataframe - - return dict_mean_growth_ratio # , growth_ratio_df - # aka "imputation links" - what naming is best? 
- - -# TODO break this function into smaller functions -def forward_imputation( - df: pd.DataFrame, - column: str, - target_variables_list: list, - current_period: str, - previous_period: str, -) -> pd.DataFrame: - """_summary_ - - Args: - df (_type_): _description_ - - Returns: - _type_: _description_ - """ - - df_growth_ratio = df[~df.isin([np.nan]).any(axis=1)].copy() - # df_growth_ratio = df[ - # df[f"{current_period}_var1"] != "missing" - # ].copy() # TODO add f string - - dict_mean_growth_ratio = loop_unique( - df_growth_ratio, - column, - target_variables_list, - current_period, - previous_period, - ) - - dfs_list = [] - df_final = df.copy() - for class_name in df_final[f"{current_period}_class"].unique(): - for var in target_variables_list: - df_other = df_final[ - df_final[f"{current_period}_class"] == class_name - ].copy() - df_other = df_other[ - df_other[f"{current_period}_{var}"].isnull() - ].copy() # change the name of df_final and df_other - - df_other[f"{class_name}_{var}_growth_ratio"] = dict_mean_growth_ratio[ - f"{class_name}_{var}_mean_growth_ratio and count" - ][0] - df_other[f"forwards_imputed_{var}"] = round( - df_other[f"{class_name}_{var}_growth_ratio"] - * df_other[f"{previous_period}_{var}"] - ).astype("Int64") - - df_other = df_other.drop(columns=[f"{class_name}_{var}_growth_ratio"]) - dfs_list.append(df_other) - - df_out = pd.concat(dfs_list) - - return df_out - - -# TODO break this function into smaller functions -def backwards_imputation( - df: pd.DataFrame, - column: str, - target_variables_list: list, - current_period: str, - previous_period: str, -) -> pd.DataFrame: - """_summary_ - - Args: - df (_type_): _description_ - - Returns: - _type_: _description_ - """ - - df_growth_ratio = df[~df.isin([np.nan]).any(axis=1)].copy() - # df_growth_ratio = df[ - # df[f"{previous_period}_var1"] != "missing" - # ].copy() # TODO add f string - - dict_mean_growth_ratio = loop_unique( - df_growth_ratio, - column, - target_variables_list, - current_period, - previous_period, - ) - - dfs_list = [] - df_final = df.copy() - for class_name in df_final[f"{current_period}_class"].unique(): - for var in target_variables_list: - df_other = df_final[ - df_final[f"{current_period}_class"] == class_name - ].copy() - df_other = df_other[ - df_other[f"{previous_period}_{var}"].isnull() - ].copy() # TODO change the name of df_final and df_other - # TODO add f string to previous_period_var1 - df_other[f"{class_name}_{var}_growth_ratio"] = dict_mean_growth_ratio[ - f"{class_name}_{var}_mean_growth_ratio and count" - ][0] - df_other[f"backwards_imputed_{var}"] = round( - df_other[f"{current_period}_{var}"] - / df_other[f"{class_name}_{var}_growth_ratio"] - ).astype("Int64") - df_other = df_other.drop(columns=[f"{class_name}_{var}_growth_ratio"]) - dfs_list.append(df_other) - - df_out = pd.concat(dfs_list) - - return df_out - - -def run_imputation( - # full_responses: pd.DataFrame, # df = full_responses.copy() - # column: str, - test_df, - target_variables_list: list, - current_period: str, - previous_period: str, -) -> pd.DataFrame: - """_summary_ - - Args: - df (_type_): _description_ - - Returns: - _type_: _description_ - """ - - # replacing civ_or_def with 200 and Product_group with 201 - test_df = rename_imp_col(test_df) - - # q200 is Business or business R&D type - # q201 is Product Group - clean_df = create_imp_class_col(test_df, "200", "201", f"{current_period}_class") - clean_df.reset_index(drop=True, inplace=True) - - # TODO:flag_nulls_and_zeros() could can optionally be run to 
output a QA csv - # indicating where there are nulls and zeros in the target variables - # flagged_df = flag_nulls_and_zeros( - # target_variables_list, clean_df, current_period, previous_period - # ) - - forward_df = forward_imputation( - clean_df, - f"{current_period}_class", - target_variables_list, - current_period, - previous_period, - ) - - backwards_df = backwards_imputation( - clean_df, - f"{current_period}_class", - target_variables_list, - current_period, - previous_period, - ) - - return forward_df, backwards_df - - -def update_imputed( - full_resp_df, - imputed_vals_df, - target_variables_list, - imputation_direction, - ref_col="reference", -) -> pd.DataFrame: - """Updates missing response data with imputed values for target variables - - Keyword Arguments: - full_resp_df -- DataFrame of the response data - imputed_vals_df -- DataFrame contining imputed values calculated in - imputation module - target_variables_list -- list of variable that need imputed if no - response - imputation_direction -- can be either "forwards" or "backwards" depending on - whether current or previous period has no response - - Returns: - full_resp_df: DataFrame with missing exchanged for imputed values - for target variables - """ - - # Validate the input dataframes checking for columns - if not all( - col in full_resp_df.columns for col in [ref_col] + target_variables_list - ): - ImputationLogger.debug("There are some cols missing in full responses.") - raise ValueError("One or more columns are missing in full_resp_df") - - if not all( - col in imputed_vals_df.columns - for col in [ref_col] - + [f"{imputation_direction}_imputed_{col}" for col in target_variables_list] - ): - ImputationLogger.debug("There are some cols missing in imputed_vals_df.") - raise ValueError("One or more columns are missing in imputed_vals_df") - - # add imputed tag column - full_resp_df["imputation_marker"] = "response" - imputed_vals_df["imputation_marker"] = f"{imputation_direction}_imputed" - - # exchange reference col for index - # in preparation for update function - full_resp_df.index = full_resp_df[ref_col] - imputed_vals_df.index = imputed_vals_df[ref_col] - - # rename cols in preparation for update function - for col in target_variables_list: - imputed_vals_df = imputed_vals_df.rename( - columns={f"{imputation_direction}_imputed_{col}": col} - ) - - # apply update - changes input_full inplace - full_resp_df.update(imputed_vals_df) - - # change index back to normal - full_resp_df = full_resp_df.reset_index(drop=True) - - return full_resp_df diff --git a/src/imputation/imputation_helpers.py b/src/imputation/imputation_helpers.py index f91f646b5..0e6622a41 100644 --- a/src/imputation/imputation_helpers.py +++ b/src/imputation/imputation_helpers.py @@ -245,7 +245,6 @@ def tidy_imputation_dataframe( to_drop += ["200_original", "pg_sic_class", "empty_pgsic_group", "empty_pg_group"] to_drop += ["200_imp_marker", "211_trim", "305_trim", "manual_trim"] - to_drop += ["sf_expansion_grouping"] df = df.drop(columns=to_drop) # Keep only clear and imputed records diff --git a/tests/test_imputation/test_imputation.py b/tests/test_imputation/test_imputation.py deleted file mode 100644 index 024411750..000000000 --- a/tests/test_imputation/test_imputation.py +++ /dev/null @@ -1,1173 +0,0 @@ -import numpy as np -import pandas as pd -from pandas._testing import assert_frame_equal -from pandas import DataFrame as pandasDF - -from src.imputation.imputation import ( - update_imputed, - run_imputation, - backwards_imputation, - 
forward_imputation, - loop_unique, - get_mean_growth_ratio, - trim_bounds, - trim_check, - calc_growth_ratio, - sort_df, - filter_by_column_content, - create_imp_class_col, - filter_same_class, - filter_pairs, - flag_nulls_and_zeros, -) - - -class TestCleanData: # usetag - """Unit test for filter_by_column_content""" - - def input_data_filter_by_column_content(self): - """Create input data for the filter_by_column_content function""" - - # columns for the dataframe - input_cols = ["clean_check"] - - # data in the column order above - input_data = [["clean"], ["not_clean"]] - - # Create a pandas dataframe - input_df = pandasDF(data=input_data, columns=input_cols) - - return input_df - - def output_data_filter_by_column_content(self): - """Create output data for the filter_by_column_content function""" - - # columns for the dataframe - output_cols = ["clean_check"] - - # data in the column order above - output_data = [["clean"]] - - # Create a pandas dataframe - df_expout = pandasDF(data=output_data, columns=output_cols) - - return df_expout - - def test_filter_by_column_content(self): - """Test the expected functionality""" - - df_input = self.input_data_filter_by_column_content() - df_expout = self.output_data_filter_by_column_content() - column = "clean_check" - column_content = "clean" - df_result = filter_by_column_content( - df_input, column, column_content - ) # add period filter functionality - assert_frame_equal(df_result, df_expout) - - -class TestCreateClassCol: - """Unit test for create_imp_class_col""" - - def input_data_create_imp_class_col(self): - """Create input data for the create_imp_class_col function""" - - input_cols = ["200", "201"] - - input_data = [["C", "AG"]] - - input_df = pandasDF(data=input_data, columns=input_cols) - - return input_df - - def output_data_create_imp_class_col(self): - """Create output data for the create_imp_class_col function""" - - output_cols = ["200", "201", "class"] - - output_data = [["C", "AG", "C_AG"]] - - df_expout = pandasDF(data=output_data, columns=output_cols) - - return df_expout - - def test_create_imp_class_col(self): - """Test the expected functionality""" - - df_input = self.input_data_create_imp_class_col() - df_expout = self.output_data_create_imp_class_col() - - col_first_half = "200" - col_second_half = "201" - class_name = "class" - - df_result = create_imp_class_col( - df_input, col_first_half, col_second_half, class_name - ) # add period filter functionality - assert_frame_equal(df_result, df_expout) - - -class TestFilterSameClass: - """Unit test for filter_same_class""" - - def input_data_filter_same_class(self): - """Create input data for the filter_same_class function""" - - # columns for the dataframe - input_cols = ["company_ref", "190012_class", "190009_class"] - - # data in the column order above - input_data = [ - [1, "class1", "class1"], - [10, "class1", "class2"], - [20, "class2", "class1"], - ] - - # Create a pandas dataframe - input_df = pandasDF(data=input_data, columns=input_cols) - - return input_df - - def output_data_filter_same_class(self): - """Create output data for the filter_same_class function""" - - # columns for the dataframe - output_cols = ["company_ref", "190012_class", "190009_class"] - - # data in the column order above - output_data = [[1, "class1", "class1"]] - - # Create a pandas dataframe - df_expout = pandasDF(data=output_data, columns=output_cols) - - return df_expout - - def test_filter_same_class(self): - """Test the expected functionality""" - - df_input = 
self.input_data_filter_same_class() - df_expout = self.output_data_filter_same_class() - - current_period = "190012" - previous_period = "190009" - - df_result = filter_same_class(df_input, current_period, previous_period) - assert_frame_equal(df_result, df_expout) - - -class TestFilterPairs: - """Unit test for filter_pairs""" - - def input_data_filter_pairs(self): - """Create input data for the filter_pairs function""" - - # columns for the dataframe - input_cols = ["company_ref", "190012_target_status", "190009_target_status"] - - # data in the column order above - input_data = [ - [1, "Present", "Present"], - [10, "Missing", "Present"], - [20, "Present", "Missing"], - ] - - # Create a pandas dataframe - input_df = pandasDF(data=input_data, columns=input_cols) - - return input_df - - def output_data_filter_pairs(self): - """Create output data for the filter_pairs function""" - - # columns for the dataframe - output_cols = ["company_ref", "190012_target_status", "190009_target_status"] - - # data in the column order above - output_data = [[1, "Present", "Present"]] - - # Create a pandas dataframe - df_expout = pandasDF(data=output_data, columns=output_cols) - - return df_expout - - def test_filter_pairs(self): - """Test the expected functionality""" - - df_input = self.input_data_filter_pairs() - df_expout = self.output_data_filter_pairs() - - target_variable = "target" - current_period = "190012" - previous_period = "190009" - - df_result = filter_pairs( - df_input, target_variable, current_period, previous_period - ) # add period filter functionality - assert_frame_equal(df_result, df_expout) - - -class TestFlagNullsZeros: - """Unit tests for flag_nulls_zeros.""" - - def input_data(self): - """Create dataframe for input data.""" - input_schema = { - "ref": "Int64", - "curr_var1": "Int64", - "prev_var1": "Int64", - "curr_var2": "Int64", - "prev_var2": "Int64", - } - - input_data = [ - [1, 100, np.nan, 0, 201], - [2, 100, 101, 200, 201], - [3, np.nan, 101, 200, 201], - [4, 100, 101, 200, 201], - [5, 100, np.nan, 200, 201], - [6, 100, 101, 200, 201], - [7, 100, 101, 0, 201], - [8, 100, 101, 200, 0], - ] - - input_df = pandasDF(data=input_data, columns=input_schema.keys()).astype( - input_schema - ) - - return input_df - - def output_data(self): - """Create dataframe for output data.""" - out_schema = { - "ref": "Int64", - "curr_var1": "Int64", - "prev_var1": "Int64", - "curr_var2": "Int64", - "prev_var2": "Int64", - "var1_valid": "Bool", - "var2_valid": "Bool", - } - - output_data = [ - [1, 100, np.nan, 0, 201, False, False], - [2, 100, 101, 200, 201, True, True], - [3, np.nan, 101, 200, 201, False, True], - [4, 100, 101, 200, 201, True, True], - [5, 100, np.nan, 200, 201, False, True], - [6, 100, 101, 200, 201, True, True], - [7, 100, 101, 0, 201, True, False], - [8, 100, 101, 200, 0, True, False], - ] - - output_df = pandasDF(data=output_data, columns=out_schema.keys()).astype( - out_schema - ) - - return output_df - - def test_flag_nulls_and_zeros(self): - """Unit test for flag_nulls_and_zeros.""" - df_expout = self.output_data() - input_df = self.input_data() - df_result = flag_nulls_and_zeros(["var1", "var2"], input_df, "curr", "prev") - assert_frame_equal(df_result, df_expout) - - -class TestCalcGrowthRatio: - """Unit test for calc_growth_ratio""" - - def input_data_calc_growth_ratio(self): - """Create input data for the calc_growth_ratio function""" - - input_cols = { - "status": "str", - "current_var1": "Int64", - "previous_var1": "Int64", - "current_var2": "Int64", - 
"previous_var2": "Int64", - } - - input_data = [ - ["Clear", 2, 8, 2, 4], - ["Clear", 3, 6, 2, np.nan], - ["Clear", np.nan, 8, np.nan, 4], - ["Clear", 2, 1, 2, 4], - ["Form sent out", 5, 3, 2, 4], - ] - - input_df = pandasDF(data=input_data, columns=input_cols.keys()).astype( - input_cols - ) - - return input_df - - def output_data_calc_growth_ratio( - self, - ): # 'Imputed(Fwd)','Imputed(Bwd)', 'ACTUAL', 'Const(Prog)' - """Create output data for the calc_growth_ratio function""" - - output_cols = { - "status": "str", - "current_var1": "Int64", - "previous_var1": "Int64", - "current_var2": "Int64", - "previous_var2": "Int64", - "var1_growth_ratio": "float", - } - - output_data = [ - ["Clear", 2, 8, 2, 4, 0.25], - ["Clear", 3, 6, 2, np.nan, 0.5], - ["Clear", np.nan, 8, np.nan, 4, np.nan], - ["Clear", 2, 1, 2, 4, 2.0], - ["Form sent out", 5, 3, 2, 4, np.nan], - ] - - # Create a pandas dataframe - df_expout = pandasDF(data=output_data, columns=output_cols.keys()).astype( - output_cols - ) - - return df_expout - - def test_calc_growth_ratio(self): - """Test the expected functionality""" - - target_variable = "var1" - input_df = self.input_data_calc_growth_ratio() - df_expout = self.output_data_calc_growth_ratio() - current_period = "current" - previous_period = "previous" - - print(input_df, "\n", df_expout) - df_result = calc_growth_ratio( - target_variable, input_df, current_period, previous_period - ) - assert_frame_equal(df_result, df_expout) - - -class TestSortDf: - """Unit test for sort_df""" - - def input_data_sort_df(self): - """Create input data for the sort_df function""" - - # columns for the dataframe - input_cols = [ - "200", - "201", - "var1_growth_ratio", - "employees", - "reference", - ] - - # data in the column order above - input_data = [ - [3, 1, 1, 1, 1], - [2, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [3, 1, 1, 2, 1], - [2, 1, 1, 2, 1], - [1, 1, 1, 2, 1], - ] - - # Create a pandas dataframe - input_df = pandasDF(data=input_data, columns=input_cols) - - return input_df - - def output_data_sort_df(self): - """Create output data for the sort_df function""" - - # columns for the dataframe - output_cols = [ - "200", - "201", - "var1_growth_ratio", - "employees", - "reference", - ] - - # data in the column order above - output_data = [ - [1, 1, 1, 2, 1], - [1, 1, 1, 1, 1], - [2, 1, 1, 2, 1], - [2, 1, 1, 1, 1], - [3, 1, 1, 2, 1], - [3, 1, 1, 1, 1], - ] - - # Create a pandas dataframe - df_expout = pandasDF(data=output_data, columns=output_cols) - - return df_expout - - def test_sort_df(self): - """Test the expected functionality""" - - df_input = self.input_data_sort_df() - df_expout = self.output_data_sort_df() - target_variable = "var1" - - df_result = sort_df(target_variable, df_input) - assert_frame_equal(df_result, df_expout) - - -class TestTrimCheck: - """Unit test for trim_check""" - - def input_data_trim_check_less_than_10(self): - """Create input data for the trim_check function""" - - # columns for the dataframe - input_cols = ["col1", "col2"] - - # data in the column order above - input_data = [ - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - ] # 9 rows (less than 10) - - # Create a pandas dataframe - input_df = pandasDF(data=input_data, columns=input_cols) - - return input_df - - def input_data_trim_check_equal_10(self): - """Create input data for the trim_check function""" - - # columns for the dataframe - input_cols = ["col1", "col2"] - - # data in the column order above - input_data = [ - [1, 1], - [1, 1], - [1, 1], - [1, 1], - 
[1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - ] # 10 rows (==10) - - # Create a pandas dataframe - input_df = pandasDF(data=input_data, columns=input_cols) - - return input_df - - def input_data_trim_check_more_than_10(self): - """Create input data for the trim_check function""" - - # columns for the dataframe - input_cols = ["col1", "col2"] - - # data in the column order above - input_data = [ - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - ] # 11 rows (more than 10) - - # Create a pandas dataframe - input_df = pandasDF(data=input_data, columns=input_cols) - - return input_df - - def output_data_trim_check_less_than_10(self): - """Create output data for the trim_check function""" - - # columns for the dataframe - output_cols = ["col1", "col2", "trim_check"] - - # data in the column order above - output_data = [ - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - ] - - # Create a pandas dataframe - df_expout = pandasDF(data=output_data, columns=output_cols) - - return df_expout - - def output_data_trim_check_equal_10(self): - """Create output data for the trim_check function""" - - # columns for the dataframe - output_cols = ["col1", "col2", "trim_check"] - - # data in the column order above - output_data = [ - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - ] - - # Create a pandas dataframe - df_expout = pandasDF(data=output_data, columns=output_cols) - - return df_expout - - def output_data_trim_check_more_than_10(self): - """Create output data for the trim_check function""" - - # columns for the dataframe - output_cols = ["col1", "col2", "trim_check"] - - # data in the column order above - output_data = [ - [1, 1, "above_trim_threshold"], - [1, 1, "above_trim_threshold"], - [1, 1, "above_trim_threshold"], - [1, 1, "above_trim_threshold"], - [1, 1, "above_trim_threshold"], - [1, 1, "above_trim_threshold"], - [1, 1, "above_trim_threshold"], - [1, 1, "above_trim_threshold"], - [1, 1, "above_trim_threshold"], - [1, 1, "above_trim_threshold"], - [1, 1, "above_trim_threshold"], - ] - - # Create a pandas dataframe - df_expout = pandasDF(data=output_data, columns=output_cols) - - return df_expout - - def test_trim_check(self): - """Test the expected functionality""" - - df_input_less_than_10 = self.input_data_trim_check_less_than_10() - df_input_equal_10 = self.input_data_trim_check_equal_10() - df_input_more_than_10 = self.input_data_trim_check_more_than_10() - - df_expout_less_than_10 = self.output_data_trim_check_less_than_10() - df_expout_equal_10 = self.output_data_trim_check_equal_10() - df_expout_more_than_10 = self.output_data_trim_check_more_than_10() - - df_result_less_than_10 = trim_check(df_input_less_than_10) - df_result_equal_10 = trim_check(df_input_equal_10) - df_result_more_than_10 = trim_check(df_input_more_than_10) - - assert_frame_equal(df_expout_less_than_10, df_result_less_than_10) - assert_frame_equal(df_expout_equal_10, df_result_equal_10) - 
assert_frame_equal(df_expout_more_than_10, df_result_more_than_10) - - -class TestTrimBounds: - """Unit test for trim_bounds""" - - def input_data_trim_bounds(self): - """Create input data for the trim_bounds function""" - - # columns for the dataframe - input_cols = ["col1", "col2", "trim_check"] - - # data in the column order above - input_data = [ - [1, 1, "above_trim_threshold"], - [2, 1, "above_trim_threshold"], - [3, 1, "above_trim_threshold"], - [4, 1, "above_trim_threshold"], - [5, 1, "above_trim_threshold"], - ] - - # Create a pandas dataframe - input_df = pandasDF(data=input_data, columns=input_cols) - - return input_df - - def output_data_trim_bounds(self): - """Create output data for the trim_bounds function""" - - # columns for the dataframe - output_cols = ["col1", "col2", "trim_check", "trim"] - - # data in the column order above - output_data = [ - [1, 1, "above_trim_threshold", "do trim"], - [2, 1, "above_trim_threshold", "dont trim"], - [3, 1, "above_trim_threshold", "dont trim"], - [4, 1, "above_trim_threshold", "dont trim"], - [5, 1, "above_trim_threshold", "do trim"], - ] # ! would I want to remove 4th and 5 or just 4th - - # Create a pandas dataframe - output_df = pandasDF(data=output_data, columns=output_cols) - - return output_df - - def test_trim_bounds(self): - """Test the expected functionality""" - - input_df = self.input_data_trim_bounds() - expout_df = self.output_data_trim_bounds() - - df_result = trim_bounds(input_df) # add period filter functionality - assert_frame_equal(df_result, expout_df) - - -class TestGetMeanGrowthRatio: - """Unit test for get_mean_growth_ratio""" - - def input_data_get_mean_growth_ratio(self): - """Create input data for the get_mean_growth_ratio function""" - - # columns for the dataframe - input_cols = ["var1_growth_ratio", "trim"] - - # data in the column order above - input_data = [ - [1, "dont trim"], - [2, "dont trim"], - [3, "dont trim"], - [4, "dont trim"], - [5, "dont trim"], - ] - - # Create a pandas dataframe - input_df = pandasDF(data=input_data, columns=input_cols) - - return input_df - - def output_data_get_mean_growth_ratio(self): - """Create output data for the get_mean_growth_ratio function""" - - # output dict - output_dict = {"class1_var1_mean_growth_ratio and count": [3.0, 5]} - - return output_dict - - def test_get_mean_growth_ratio(self): - """Test the expected functionality""" - - input_df = self.input_data_get_mean_growth_ratio() - expout_dict = self.output_data_get_mean_growth_ratio() - # expout_df = self.output_data_get_mean_growth_ratio_df() - - result_dict = get_mean_growth_ratio( - input_df, {}, "class1", "var1" - ) # add period filter functionality - assert result_dict == expout_dict - # assert_frame_equal(results_df, expout_df) - - -class TestLoopUnique: # testing for loops run as expected - """Unit test for loop_unique""" - - def input_data_loop_unique(self): - """Create input data for the loop_unique function""" - - # columns for the dataframe - input_cols = [ - "status", - "current_period_class", - "200", - "201", - "current_period_var1", - "current_period_var2", - "previous_period_var1", - "previous_period_var2", - "employees", - "reference", - "trim", - ] - - # data in the column order above - input_data = [ - ["Clear", "class1", "C", "G", 2, 4, 2, 2, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 2, 4, 2, 2, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 2, 4, 2, 2, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 2, 4, 2, 2, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 2, 4, 2, 
2, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 2, 4, 2, 2, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 2, 4, 2, 2, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 2, 4, 2, 2, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 2, 4, 2, 2, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 2, 4, 2, 2, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 2, 4, 2, 2, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 8, 2, 2, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 8, 2, 2, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 8, 2, 2, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 8, 2, 2, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 8, 2, 2, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 8, 2, 2, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 8, 2, 2, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 8, 2, 2, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 8, 2, 2, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 8, 2, 2, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 8, 2, 2, 1, 1, "dont trim"], - ] # (more than 10 rows per class) - - # Create a pandas dataframe - input_df = pandasDF(data=input_data, columns=input_cols) - - return input_df - - def output_data_loop_unique(self): - """Create output data for the loop_unique function""" - - # output dict - output_dict = { - "class1_var1_mean_growth_ratio and count": [1.0, 7], - "class1_var2_mean_growth_ratio and count": [2.0, 7], - "class2_var1_mean_growth_ratio and count": [3.0, 7], - "class2_var2_mean_growth_ratio and count": [4.0, 7], - } - - return output_dict - - def test_loop_unique(self): - """Test the expected functionality""" - - input_df = self.input_data_loop_unique() - expout_dict = self.output_data_loop_unique() - # expout_df = self.output_data_loop_unique_df() - - column = "current_period_class" - target_variables_list = ["var1", "var2"] - current_period = "current_period" - previous_period = "previous_period" - - result_dict = loop_unique( - input_df, # removed , result_df - column, - target_variables_list, - current_period, - previous_period, - ) - assert result_dict == expout_dict - # assert_frame_equal(result_df, expout_df) - - -class TestForwardImputation: - """Unit test for forward_imputation""" - - def input_data_forward_imputation(self): - """Create input data for the forward_imputation function""" - - input_cols = { - "status": "str", - "current_period_class": "str", - "200": "str", - "201": "str", - "current_period_var1": "Int64", - "previous_period_var1": "Int64", - "employees": "Int64", - "reference": "Int64", - "trim": "str", - } - - input_data = [ - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", np.nan, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", 
"class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", np.nan, 1, 1, 1, "dont trim"], - ] # (more than 10 rows per class) - - input_df = pandasDF(data=input_data, columns=input_cols.keys()) - input_df = input_df.astype(input_cols) - - return input_df - - def output_data_forward_imputation(self): - """Create output data for the forward_imputation function""" - - output_cols = { - "status": "str", - "current_period_class": "str", - "200": "str", - "201": "str", - "current_period_var1": "Int64", - "previous_period_var1": "Int64", - "employees": "Int64", - "reference": "Int64", - "trim": "str", - "forwards_imputed_var1": "Int64", - } - - output_data = [ - ["Clear", "class1", "C", "G", np.nan, 1, 1, 1, "dont trim", 4], - ["Clear", "class2", "D", "G", np.nan, 1, 1, 1, "dont trim", 6], - ] # (more than 10 rows per class) - - output_df = pandasDF( - data=output_data, columns=output_cols.keys(), index=[11, 23] - ) - output_df = output_df.astype(output_cols) - - return output_df - - def test_forward_imputation(self): - """Test the expected functionality""" - - input_df = self.input_data_forward_imputation() - expout_dict = self.output_data_forward_imputation() - - column = "current_period_class" - target_variables_list = ["var1"] - current_period = "current_period" - previous_period = "previous_period" - - df_result = forward_imputation( - input_df, column, target_variables_list, current_period, previous_period - ) - - assert_frame_equal(df_result, expout_dict) - - -class TestBackwardsImputation: - """Unit test for backwards_imputation""" - - def input_data_backwards_imputation(self): - """Create input data for the backwards_imputation function""" - - # columns for the dataframe - input_cols = { - "status": "str", - "current_period_class": "str", - "200": "str", - "201": "str", - "current_period_var1": "Int64", - "previous_period_var1": "Int64", - "employees": "Int64", - "reference": "Int64", - "trim": "str", - } - - # data in the column order above - input_data = [ - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, np.nan, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", 
"G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, np.nan, 1, 1, "dont trim"], - ] # (more than 10 rows per class) - - # Create a pandas dataframe - input_df = pandasDF(data=input_data, columns=input_cols.keys()) - input_df = input_df.astype(input_cols) - - return input_df - - def output_data_backwards_imputation(self): - """Create output data for the backwards_imputation function""" - - # columns for the dataframe - output_cols = { - "status": "str", - "current_period_class": "str", - "200": "str", - "201": "str", - "current_period_var1": "Int64", - "previous_period_var1": "Int64", - "employees": "Int64", - "reference": "Int64", - "trim": "str", - "backwards_imputed_var1": "Int64", - } - - # data in the column order above - output_data = [ - ["Clear", "class1", "C", "G", 4, np.nan, 1, 1, "dont trim", 1], - ["Clear", "class2", "D", "G", 6, np.nan, 1, 1, "dont trim", 1], - ] # (more than 10 rows per class) - - # Create a pandas dataframe - output_df = pandasDF( - data=output_data, columns=output_cols.keys(), index=[11, 23] - ) - output_df = output_df.astype(output_cols) - - return output_df - - def test_backwards_imputation(self): - """Test the expected functionality""" - - input_df = self.input_data_backwards_imputation() - expout_df = self.output_data_backwards_imputation() - - column = "current_period_class" - target_variables_list = ["var1"] - current_period = "current_period" - previous_period = "previous_period" - - df_result = backwards_imputation( - input_df, column, target_variables_list, current_period, previous_period - ) - - assert_frame_equal(df_result, expout_df) - - -class TestRunImputation: - """Unit test for run_imputation""" - - def input_data_run_imputation(self): - """Create input data for the run_imputation function""" - # Currently input_df isn't being used as - # fake data is hard coded into - # function until ingest is firmed down - - # columns for the dataframe - input_cols = { - "status": "str", - "reference": "Int64", - "civ_or_def": "str", - "Product_group": "str", - "employees": "Int64", - "202012_var1": "Int64", - "202012_var2": "Int64", - "202009_var1": "Int64", - "202009_var2": "Int64", - } - - # data in the column order above - input_data = [ - ["Clear", 1, "2", "A", 100, 1, 1, 1, 3], - ["Clear", 2, "2", "A", 100, 11, 1, 10, 3], - ["Clear", 3, "2", "A", 100, 11, 1, 10, 3], - ["Clear", 4, "2", "A", 100, 11, 1, 10, 3], - ["Clear", 5, "2", "A", 100, 11, 1, 10, 3], - ["Clear", 6, "2", "A", 100, 11, 1, 10, 3], - ["Clear", 7, "2", "A", 100, 11, 1, 10, 3], - ["Clear", 8, "2", "A", 100, 11, 1, 10, 3], - ["Clear", 9, "2", "A", 100, 11, 1, 10, 3], - ["Clear", 10, "2", "A", 100, 11, 1, 10, 3], - ["Clear", 11, "2", "A", 100, 110, 1, 100, 3], - ["Clear", 12, "2", "A", 100, np.nan, 1, 10, 3], - ["Clear", 13, "2", "B", 100, 1, 1, 1, 3], - ["Clear", 14, "2", "B", 100, 11, 1, 10, 3], - ["Clear", 15, "2", "B", 100, 11, 1, 10, 3], - ["Clear", 16, "2", "B", 100, 11, 1, 10, 3], - ["Clear", 17, "2", "B", 100, 11, 1, 10, 3], - ["Clear", 18, "2", "B", 100, 11, 1, 10, 3], - ["Clear", 19, "2", "B", 100, 11, 1, 10, 3], - ["Clear", 20, "2", "B", 100, 11, 1, 10, 3], - ["Clear", 21, "2", "B", 100, 11, 1, 10, 3], - ["Clear", 22, "2", "B", 100, 11, 1, 10, 3], - ["Clear", 23, "2", "B", 100, 110, 1, 100, 3], - ["Clear", 24, "2", "B", 100, 11, 1, 10, np.nan], - ] # (more than 10 rows per class) - - # Create 
a pandas dataframe - input_df = pandasDF(data=input_data, columns=input_cols.keys()) - input_df = input_df.astype(input_cols) - - return input_df - - def output_data_run_imputation(self): - """Create output data for the run_imputation function""" - output_cols_f = { - "status": "str", - "reference": "Int64", - "200": "str", - "201": "str", - "employees": "Int64", - "202012_var1": "Int64", - "202012_var2": "Int64", - "202009_var1": "Int64", - "202009_var2": "Int64", - "202012_class": "str", - "forwards_imputed_var1": "Int64", - "forwards_imputed_var2": "Int64", - } - - output_data_for = [ - ["Clear", 12, "2", "A", 100, np.nan, 1, 10, 3, "2_A", 11, np.nan], - ] # (more than 10 rows per class) - - output_df_for = pandasDF( - data=output_data_for, columns=output_cols_f.keys(), index=[11] - ).astype(output_cols_f) - - # TODO check data types and update headers - # when using real data - # columns for the dataframe - output_cols_b = { - "status": "str", - "reference": "Int64", - "200": "str", - "201": "str", - "employees": "Int64", - "202012_var1": "Int64", - "202012_var2": "Int64", - "202009_var1": "Int64", - "202009_var2": "Int64", - "202012_class": "str", - "backwards_imputed_var1": "Int64", - "backwards_imputed_var2": "Int64", - } - - # TODO check data types and update headers - # when using real data - # data in the column order above - output_data_back = [ - ["Clear", 24, "2", "B", 100, 11, 1, 10, np.nan, "2_B", np.nan, 3], - ] # (more than 10 rows per class) - - output_df_back = pandasDF( - data=output_data_back, columns=output_cols_b.keys(), index=[23] - ).astype(output_cols_b) - - return output_df_for, output_df_back - - def test_run_imputation(self): - """Test the expected functionality""" - - input_df = self.input_data_run_imputation() - expout_df_for, expout_df_back = self.output_data_run_imputation() - - target_variables_list = ["var1", "var2"] - current_period = "202012" - previous_period = "202009" - result_for, result_back = run_imputation( - input_df, target_variables_list, current_period, previous_period - ) - pd.set_option("display.max_rows", None) - pd.set_option("display.max_columns", None) - pd.set_option("display.width", 2000) - print(result_for) - assert_frame_equal(result_for, expout_df_for) - assert_frame_equal(result_back, expout_df_back) - - -class TestUpdateImputed: - """Unit test for update_imputed""" - - def input_data_update_imputed(self): - """Create input data for the update_imputed function""" - - # columns for the dataframe - input_cols_full = [ - "reference", - "col2", - ] - - # data in the column order above - input_data_full = [ - [1.0, 1.0], - [2.0, np.nan], - ] - - # Create a pandas dataframe - input_full = pandasDF(data=input_data_full, columns=input_cols_full) - - # columns for the dataframe - input_cols_imputed = [ - "reference", - "forwards_imputed_col2", - ] - - # data in the column order above - input_data_imputed = [ - [2.0, 1.0], - ] - - # Create a pandas dataframe - input_imputed = pandasDF(data=input_data_imputed, columns=input_cols_imputed) - - return input_full, input_imputed - - def output_data_update_imputed(self): - """Create output data for the update_imputed function""" - - # columns for the dataframe - output_cols = ["reference", "col2", "imputation_marker"] - - # data in the column order above - output_data = [ - [1.0, 1.0, "response"], - [2.0, 1.0, "forwards_imputed"], - ] # (more than 10 rows per class) - - # Create a pandas dataframe - output_df = pandasDF(data=output_data, columns=output_cols) - - return output_df - - def 
test_update_imputed(self): - """Test the expected functionality""" - - input_full, input_imputed = self.input_data_update_imputed() - output_df = self.output_data_update_imputed() - - target_variables_list = ["col2"] - direction = "forwards" - - df_result = update_imputed( - input_full, input_imputed, target_variables_list, direction - ) - - assert_frame_equal(df_result, output_df) From 799802f936990a9113521e21cc679e7a9cbabe05 Mon Sep 17 00:00:00 2001 From: jwestw Date: Thu, 11 Jan 2024 15:40:01 +0000 Subject: [PATCH 10/26] Changes requested during joint review --- src/imputation/imputation_helpers.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/imputation/imputation_helpers.py b/src/imputation/imputation_helpers.py index 0e6622a41..75c1142d6 100644 --- a/src/imputation/imputation_helpers.py +++ b/src/imputation/imputation_helpers.py @@ -222,13 +222,13 @@ def fill_sf_zeros(df: pd.DataFrame) -> pd.DataFrame: def tidy_imputation_dataframe( - df: pd.DataFrame, - config: Dict, - logger, - to_impute_cols: List, - write_csv: Callable, - run_id: int, - ) -> pd.DataFrame: + df: pd.DataFrame, + config: Dict, + logger: logging.Logger, + to_impute_cols: List, + write_csv: Callable, + run_id: int, +) -> pd.DataFrame: """Remove rows and columns not needed after imputation.""" # Create lists for the qa cols imp_cols = [f"{col}_imputed" for col in to_impute_cols] @@ -247,15 +247,15 @@ def tidy_imputation_dataframe( to_drop += ["200_imp_marker", "211_trim", "305_trim", "manual_trim"] df = df.drop(columns=to_drop) - # Keep only clear and imputed records - imputed_statuses = ["TMI", "CF", "MoR", "constructed"] - to_keep = df["imp_marker"].isin(imputed_statuses) | (df["imp_marker"] == "R") + # Keep only imputed records and clear ("R") + imp_markers_to_keep = ["TMI", "CF", "MoR", "constructed"] + to_keep = df["imp_marker"].isin(imp_markers_to_keep) | (df["imp_marker"] == "R") to_keep_df = df.copy().loc[to_keep] filtered_output_df = df.copy().loc[~to_keep] # change the value of the status column to 'imputed' for imputed statuses - condition = to_keep_df["status"].isin(imputed_statuses) + condition = to_keep_df["imp_marker"].isin(imp_markers_to_keep) to_keep_df.loc[condition, "status"] = "imputed" # Running status filtered full dataframe output for QA From a01f5f201ee5661edcce873279808b1cb931f5a0 Mon Sep 17 00:00:00 2001 From: Tom Coates Date: Thu, 11 Jan 2024 15:56:56 +0000 Subject: [PATCH 11/26] fix shortform and postcode bugs in construction --- src/construction/construction.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/construction/construction.py b/src/construction/construction.py index 484ee4e51..b5f043cde 100644 --- a/src/construction/construction.py +++ b/src/construction/construction.py @@ -4,6 +4,7 @@ from typing import Callable from src.staging.validation import validate_data_with_schema +from src.staging.staging_helpers import postcode_topup from src.outputs.outputs_helpers import create_period_year construction_logger = logging.getLogger(__name__) @@ -87,6 +88,10 @@ def run_construction( updated_snapshot_df = create_period_year(updated_snapshot_df) construction_df = create_period_year(construction_df) + # Set instance=1 so longforms with status 'Form sent out' match correctly + form_sent_condition = (updated_snapshot_df.formtype == "0001") & (updated_snapshot_df.status == "Form sent out") + updated_snapshot_df.loc[form_sent_condition, "instance"] = 1 + # NI data has no instance but needs an instance of 1 if 
is_northern_ireland: construction_df["instance"] = 1 @@ -115,6 +120,19 @@ def run_construction( {"reference": "Int64", "instance": "Int64", "period_year": "Int64"} ) + # Long form records with a postcode in 601 use this as the postcode + long_form_cond = (~updated_snapshot_df["601"].isnull()) + updated_snapshot_df.loc[long_form_cond, "postcodes_harmonised"] = updated_snapshot_df["601"] + + # Short form records with nothing in 601 use referencepostcode instead + short_form_cond = (updated_snapshot_df["601"].isnull()) & (~updated_snapshot_df["referencepostcode"].isnull()) + updated_snapshot_df.loc[short_form_cond, "postcodes_harmonised"] = updated_snapshot_df["referencepostcode"] + + # Top up all new postcodes so they're all eight characters exactly + postcode_cols = ["601", "referencepostcode", "postcodes_harmonised"] + for col in postcode_cols: + updated_snapshot_df[col] = updated_snapshot_df[col].apply(postcode_topup) + construction_logger.info(f"Construction edited {construction_df.shape[0]} rows.") return updated_snapshot_df From 79fe040faf3af483a665cc35cffbf8e5a5214228 Mon Sep 17 00:00:00 2001 From: Cheshire Date: Thu, 11 Jan 2024 16:21:41 +0000 Subject: [PATCH 12/26] RDRP-646: basic functionality added --- .pre-commit-config.yaml | 22 +++++++++++----------- src/construction/construction.py | 25 +++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e7f65f642..fc5ea89b9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,17 +9,17 @@ repos: args: - --extra-keys - "metadata.colab metadata.kernelspec cell.metadata.colab cell.metadata.executionInfo cell.metadata.id cell.metadata.outputId" - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.0.1 - hooks: - # - id: check-added-large-files - # name: Check for files larger than 5 MB - # args: ["--maxkb=5120"] - - id: end-of-file-fixer - name: Check for a blank line at the end of scripts (auto-fixes) - exclude: '\.Rd' - - id: trailing-whitespace - name: Check for trailing whitespaces (auto-fixes) + # - repo: https://github.com/pre-commit/pre-commit-hooks + # rev: v4.0.1 + # hooks: + # # - id: check-added-large-files + # # name: Check for files larger than 5 MB + # # args: ["--maxkb=5120"] + # - id: end-of-file-fixer + # name: Check for a blank line at the end of scripts (auto-fixes) + # exclude: '\.Rd' + # - id: trailing-whitespace + # name: Check for trailing whitespaces (auto-fixes) - repo: https://github.com/pycqa/isort rev: 5.8.0 hooks: diff --git a/src/construction/construction.py b/src/construction/construction.py index 484ee4e51..dc1751eed 100644 --- a/src/construction/construction.py +++ b/src/construction/construction.py @@ -77,6 +77,9 @@ def run_construction( validate_data_with_schema(construction_df, schema_path) construction_df = construction_df.dropna(axis="columns", how="all") + # Prepare the short to long form constructions + updated_snapshot_df = prepare_short_to_long(updated_snapshot_df, construction_df) + # Add flags to indicate whether a row was constructed or should be imputed updated_snapshot_df["is_constructed"] = False updated_snapshot_df["force_imputation"] = False @@ -115,6 +118,28 @@ def run_construction( {"reference": "Int64", "instance": "Int64", "period_year": "Int64"} ) + updated_snapshot_df = updated_snapshot_df.sort_values( + ["reference", "instance"], ascending=[True, True] + ).reset_index(drop=True) + construction_logger.info(f"Construction edited {construction_df.shape[0]} rows.") 
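    # Editor's aside: a hedged sketch, not the actual helper. The intent of the
    # postcode_topup calls above (per the comment in this patch) is to make
    # every postcode exactly eight characters. Assuming pandas is imported as
    # pd, a minimal stand-in consistent with that description could look like
    # this; the real postcode_topup in src/staging/staging_helpers.py may
    # differ (for example in how it pads or truncates).
    def _postcode_topup_sketch(postcode, width: int = 8):
        # leave missing postcodes untouched, pad/trim everything else to width
        if pd.isnull(postcode):
            return postcode
        return str(postcode).strip().ljust(width)[:width]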
return updated_snapshot_df + + +def prepare_short_to_long(updated_snapshot_df, construction_df): + """Create addional instances for short to long construction""" + # Check which references are going to converted to long forms + short_to_long_refs = construction_df.loc[construction_df["short_to_long"] == True, "reference"].unique() + # Create conversion df + short_to_long_df = updated_snapshot_df[updated_snapshot_df["reference"].isin(short_to_long_refs)] + + # Copy instance 0 record to create instance 1 and instance 2 + short_to_long_df1 = short_to_long_df.copy() + short_to_long_df1["instance"] = 1 + short_to_long_df2 = short_to_long_df.copy() + short_to_long_df2["instance"] = 2 + + # Add new instances to the updated snapshot df + updated_snapshot_df = pd.concat([updated_snapshot_df, short_to_long_df1, short_to_long_df2]) + return updated_snapshot_df From 306405999d6f39492a4e41204c19537e046ccc14 Mon Sep 17 00:00:00 2001 From: Cheshire Date: Thu, 11 Jan 2024 17:21:59 +0000 Subject: [PATCH 13/26] RDRP-646: pre-commits and change to flag order --- .pre-commit-config.yaml | 22 +++++++++++----------- src/construction/construction.py | 18 ++++++++++++------ src/developer_config.yaml | 9 ++++----- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fc5ea89b9..e7f65f642 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,17 +9,17 @@ repos: args: - --extra-keys - "metadata.colab metadata.kernelspec cell.metadata.colab cell.metadata.executionInfo cell.metadata.id cell.metadata.outputId" - # - repo: https://github.com/pre-commit/pre-commit-hooks - # rev: v4.0.1 - # hooks: - # # - id: check-added-large-files - # # name: Check for files larger than 5 MB - # # args: ["--maxkb=5120"] - # - id: end-of-file-fixer - # name: Check for a blank line at the end of scripts (auto-fixes) - # exclude: '\.Rd' - # - id: trailing-whitespace - # name: Check for trailing whitespaces (auto-fixes) + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.0.1 + hooks: + # - id: check-added-large-files + # name: Check for files larger than 5 MB + # args: ["--maxkb=5120"] + - id: end-of-file-fixer + name: Check for a blank line at the end of scripts (auto-fixes) + exclude: '\.Rd' + - id: trailing-whitespace + name: Check for trailing whitespaces (auto-fixes) - repo: https://github.com/pycqa/isort rev: 5.8.0 hooks: diff --git a/src/construction/construction.py b/src/construction/construction.py index dc1751eed..7daf927ac 100644 --- a/src/construction/construction.py +++ b/src/construction/construction.py @@ -77,12 +77,12 @@ def run_construction( validate_data_with_schema(construction_df, schema_path) construction_df = construction_df.dropna(axis="columns", how="all") - # Prepare the short to long form constructions - updated_snapshot_df = prepare_short_to_long(updated_snapshot_df, construction_df) - # Add flags to indicate whether a row was constructed or should be imputed updated_snapshot_df["is_constructed"] = False updated_snapshot_df["force_imputation"] = False + + # Prepare the short to long form constructions + updated_snapshot_df = prepare_short_to_long(updated_snapshot_df, construction_df) construction_df["is_constructed"] = True # Create period_year column, except for NI which already has it @@ -130,9 +130,13 @@ def run_construction( def prepare_short_to_long(updated_snapshot_df, construction_df): """Create addional instances for short to long construction""" # Check which references are going to converted to long forms - 
short_to_long_refs = construction_df.loc[construction_df["short_to_long"] == True, "reference"].unique() + short_to_long_refs = construction_df.loc[ + construction_df["short_to_long"] == True,"reference" + ].unique() # Create conversion df - short_to_long_df = updated_snapshot_df[updated_snapshot_df["reference"].isin(short_to_long_refs)] + short_to_long_df = updated_snapshot_df[ + updated_snapshot_df["reference"].isin(short_to_long_refs) + ] # Copy instance 0 record to create instance 1 and instance 2 short_to_long_df1 = short_to_long_df.copy() @@ -141,5 +145,7 @@ def prepare_short_to_long(updated_snapshot_df, construction_df): short_to_long_df2["instance"] = 2 # Add new instances to the updated snapshot df - updated_snapshot_df = pd.concat([updated_snapshot_df, short_to_long_df1, short_to_long_df2]) + updated_snapshot_df = pd.concat( + [updated_snapshot_df, short_to_long_df1, short_to_long_df2] + ) return updated_snapshot_df diff --git a/src/developer_config.yaml b/src/developer_config.yaml index e07316c20..d34950dbd 100644 --- a/src/developer_config.yaml +++ b/src/developer_config.yaml @@ -9,10 +9,10 @@ global: # Staging and validation settings postcode_csv_check: False load_updated_snapshot: False # Whether to load the updated snapshots for amendments and additions - load_ni_data: True + load_ni_data: False load_historic_data: False - run_construction: False - run_ni_construction: True + run_construction: True + run_ni_construction: False load_manual_outliers: False load_manual_imputation: False load_backdata: True # whether to load previous year data for MoR @@ -23,7 +23,6 @@ global: # Output settings output_full_responses: False output_ni_full_responses: False - output_imputation_qa: False output_auto_outliers: False output_outlier_qa : False output_estimation_qa: False @@ -88,7 +87,7 @@ network_paths: backdata_path: "R:/BERD Results System Development 2023/2021_data/validation-extract-responses-202112.csv" outliers_path: "R:/BERD Results System Development 2023/DAP_emulation/outliers" manual_outliers_path: "R:/BERD Results System Development 2023/DAP_emulation/outliers/manual_outliers/manual_outlier_2023-08-29_v67.csv" - construction_file_path: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/test_construction_file.csv" + construction_file_path: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/JC_test_construction_file.csv" construction_file_path_ni: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/test_construction_ni_file.csv" # construction_add_path: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/construction_additions_2023-11-06_v5.csv" # TODO Need to test # construction_amend_path: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/construction_amendments_2023-10-31_v2.csv" # TODO Need to test From 7fdcfa08dd87844b3235e75bbe211724b1a6c53e Mon Sep 17 00:00:00 2001 From: Griffith Date: Thu, 11 Jan 2024 17:47:45 +0000 Subject: [PATCH 14/26] update tests --- src/outputs/form_output_prep.py | 8 +- src/outputs/gb_sas.py | 15 --- src/outputs/outputs_main.py | 2 - src/outputs/tau.py | 13 -- src/staging/pg_conversion.py | 152 ++++++++++----------- src/staging/staging_main.py | 1 - tests/test_staging/test_pg_conversion.py | 160 ++++++++++++++++------- 7 files changed, 187 insertions(+), 164 deletions(-) diff --git a/src/outputs/form_output_prep.py b/src/outputs/form_output_prep.py index 
0e3898484..7b2401562 100644 --- a/src/outputs/form_output_prep.py +++ b/src/outputs/form_output_prep.py @@ -1,5 +1,5 @@ import pandas as pd -from src.staging.pg_conversion import run_pg_conversion +from src.staging.pg_conversion import sic_to_pg_mapper from src.staging.validation import flag_no_rand_spenders @@ -59,9 +59,9 @@ def form_output_prep( ni_full_responses["form_status"] = 600 ni_full_responses["602"] = 100 ni_full_responses["formtype"] = "0003" - ni_full_responses = run_pg_conversion( - ni_full_responses, pg_num_alpha, sic_pg_alpha, target_col="201" - ) + + # Update column 201 (currently PG numeric) to alpha-numeric, mapping from SIC. + ni_full_responses = sic_to_pg_mapper(ni_full_responses, sic_pg_alpha) # outputs_df = pd.concat([outputs_df, ni_full_responses]) tau_outputs_df = pd.concat([tau_outputs_df, ni_full_responses]) diff --git a/src/outputs/gb_sas.py b/src/outputs/gb_sas.py index 4435a465f..73ffaad8c 100644 --- a/src/outputs/gb_sas.py +++ b/src/outputs/gb_sas.py @@ -7,7 +7,6 @@ import src.outputs.map_output_cols as map_o from src.staging.validation import load_schema from src.outputs.outputs_helpers import create_output_df, regions -from src.staging.pg_conversion import sic_to_pg_mapper GbSasLogger = logging.getLogger(__name__) @@ -19,7 +18,6 @@ def output_gb_sas( run_id: int, ultfoc_mapper: pd.DataFrame, postcode_mapper: pd.DataFrame, - sic_pg_num: pd.DataFrame, ): """Run the outputs module. @@ -31,8 +29,6 @@ def output_gb_sas( run_id (int): The current run id ultfoc_mapper (pd.DataFrame): The ULTFOC mapper DataFrame. postcode_mapper (pd.DataFrame): maps the postcode to region code - pg_alpha_num (pd.DataFrame): mapper of numeric PG to alpha PG - """ NETWORK_OR_HDFS = config["global"]["network_or_hdfs"] @@ -47,20 +43,9 @@ def output_gb_sas( # Join foriegn ownership column using ultfoc mapper df1 = map_o.join_fgn_ownership(df1, ultfoc_mapper) - # Fill in numeric PG for short forms and imputed long forms - df1 = sic_to_pg_mapper( - df1, - sic_pg_num, - target_col="pg_numeric", - from_col="SIC 2007_CODE", - to_col="2016 > Form PG", - formtype=["0006", "0001"], - ) - # Map to the CORA statuses from the statusencoded column df1 = map_o.create_cora_status_col(df1) - # Map the sizebands based on frozen employment df1 = map_o.map_sizebands(df1) diff --git a/src/outputs/outputs_main.py b/src/outputs/outputs_main.py index c61280772..161c0be35 100644 --- a/src/outputs/outputs_main.py +++ b/src/outputs/outputs_main.py @@ -123,7 +123,6 @@ def run_outputs( run_id, ultfoc_mapper, postcode_mapper, - sic_pg_num, ) OutputMainLogger.info("Finished TAU output.") @@ -137,7 +136,6 @@ def run_outputs( run_id, ultfoc_mapper, postcode_mapper, - sic_pg_num, ) OutputMainLogger.info("Finished GB SAS output.") diff --git a/src/outputs/tau.py b/src/outputs/tau.py index e06c70a83..329ab32d3 100644 --- a/src/outputs/tau.py +++ b/src/outputs/tau.py @@ -18,7 +18,6 @@ def output_tau( run_id: int, ultfoc_mapper: pd.DataFrame, postcode_itl_mapper: pd.DataFrame, - sic_pg_num: pd.DataFrame, ): """Run the outputs module. @@ -30,8 +29,6 @@ def output_tau( run_id (int): The current run id ultfoc_mapper (pd.DataFrame): The ULTFOC mapper DataFrame. 
postcode_itl_mapper (pd.DataFrame): maps the postcode to region code - pg_alpha_num (pd.DataFrame): mapper of alpha PG to numeric PG - """ NETWORK_OR_HDFS = config["global"]["network_or_hdfs"] @@ -49,16 +46,6 @@ def output_tau( # Join foriegn ownership column using ultfoc mapper df = map_o.join_fgn_ownership(df, ultfoc_mapper, formtype=["0001", "0006"]) - # Fill in numeric PG for short forms and imputed long forms - df = sic_to_pg_mapper( - df, - sic_pg_num, - target_col="pg_numeric", - from_col="SIC 2007_CODE", - to_col="2016 > Form PG", - formtype=["0006", "0001", "0003"], - ) - # Map to the CORA statuses from the statusencoded column df = map_o.create_cora_status_col(df) diff --git a/src/staging/pg_conversion.py b/src/staging/pg_conversion.py index c6fc43aee..5fbca575c 100644 --- a/src/staging/pg_conversion.py +++ b/src/staging/pg_conversion.py @@ -5,24 +5,34 @@ PgLogger = logging.getLogger(__name__) -def pg_to_pg_mapper( +def sic_to_pg_mapper( df: pd.DataFrame, - mapper: pd.DataFrame, - target_col: str = "product_group", + sicmapper: pd.DataFrame, pg_column: str = "201", - from_col: str = "pg_numeric", - to_col: str = "pg_alpha", + sic_column: str = "rusic", + from_col: str = "SIC 2007_CODE", + to_col: str = "2016 > Form PG", ): - """This function maps all values in one column to another column - using a mapper file. This is applied to long forms only. - The default this is used for is PG numeric to letter conversion. + """Map from SIC code to PG numeric code where PG numeric is null. + + Example initial dataframe: + reference | 201 | rusic + -------------------------------- + 1 | 53 | 2500 + 2 | NaN | 1600 + 3 | NaN | 4300 + + returned dataframe: + reference | 201 | rusic + -------------------------------- + 1 | 53 | 2500 + 2 | 45 | 1600 + 3 | 38 | 4300 Args: - df (pd.DataFrame): The dataset containing all the PG numbers - mapper (pd.DataFrame): The mapper dataframe loaded using custom function - target_col (str, optional): The column we output the - mapped values to (product_group). - pg_column (str, optional): The column we want to convert (201). + df (pd.DataFrame): The dataset containing all the PG numbers. + sicmapper (pd.DataFrame): The SIC to pg numeric mapper. + sic_column (str, optional): The column containing the SIC numbers. from_col (str, optional): The column in the mapper that is used to map from. to_col (str, optional): The column in the mapper that is used to map to. @@ -30,15 +40,11 @@ def pg_to_pg_mapper( pd.DataFrame: A dataframe with all target column values mapped """ - filtered_df = df.copy() - - if "formtype" in filtered_df.columns: - formtype_cond = filtered_df["formtype"] == "0001" - filtered_df = filtered_df[formtype_cond] + df = df.copy() # Create a mapping dictionary from the 2 columns - map_dict = dict(zip(mapper[from_col], mapper[to_col])) - # Flag all PGs that don't have a corresponding map value + map_dict = dict(zip(sicmapper[from_col], sicmapper[to_col])) + # Flag all SIC numbers that don't have a corresponding map value mapless_errors = [] for key, value in map_dict.items(): if str(value) == "nan": @@ -46,45 +52,49 @@ def pg_to_pg_mapper( if mapless_errors: PgLogger.error( - f"Mapping doesnt exist for the following product groups: {mapless_errors}" + f"Mapping doesnt exist for the following SIC numbers: {mapless_errors}" ) - # Map using the dictionary taking into account the null values. 
- # Then convert to categorigal datatype - filtered_df[pg_column] = pd.to_numeric(filtered_df[pg_column], errors="coerce") - filtered_df[target_col] = filtered_df[pg_column].map(map_dict) - filtered_df[target_col] = filtered_df[target_col].astype("category") + # Map to the target column using the dictionary, null values only + df.loc[df[pg_column].isnull(), pg_column] = ( + df.loc[df[pg_column].isnull(), sic_column].map(map_dict) + ) - df.loc[ - filtered_df.index, - f"{target_col}", - ] = filtered_df[target_col] - - PgLogger.info("Product groups successfully mapped to letters") + PgLogger.info("Product group nulls successfully mapped from SIC.") return df -def sic_to_pg_mapper( +def pg_to_pg_mapper( df: pd.DataFrame, - sicmapper: pd.DataFrame, - target_col: str = "product_group", - sic_column: str = "rusic", - from_col: str = "sic", + mapper: pd.DataFrame, + pg_column: str = "201", + from_col: str = "pg_numeric", to_col: str = "pg_alpha", - formtype: str = ["0006"], ): - """This function maps all values in one column to another column - using a mapper file. This is only applied for short forms and unsampled - refs. + """Map from PG numeric to PG alpha-numeric and create a new column. + + The product group column (default: column 201) coped to a new column, "pg_numeric", + and then is updated from numeric to alpha-numeric using a mapping. + + Example initial dataframe: + reference | 201 + ---------------------- + 1 | 53 + 2 | 43 + 3 | 33 + + returned dataframe: + reference | 201 | pg_numeric + ------------------------------------ + 1 | AA | 33 + 2 | B | 43 + 3 | E | 53 - The default this is used for is PG numeric to letter conversion. Args: - df (pd.DataFrame): The dataset containing all the PG numbers. - sicmapper (pd.DataFrame): The mapper dataframe loaded using custom function. - target_col (str, optional): The column we output the - mapped values to (product_group). - sic_column (str, optional): The column containing the SIC numbers. + df (pd.DataFrame): The dataframe requiring mapping + mapper (pd.DataFrame): the PG numeric to alpha-numeric mapper + pg_column (str, optional): The column we want to convert (default 201). from_col (str, optional): The column in the mapper that is used to map from. to_col (str, optional): The column in the mapper that is used to map to. @@ -92,16 +102,15 @@ def sic_to_pg_mapper( pd.DataFrame: A dataframe with all target column values mapped """ - filtered_df = df.copy() - - filtered_df = filtered_df[filtered_df["formtype"].isin(formtype)] + df = df.copy() - if "pg_numeric" in filtered_df.columns: - filtered_df = filtered_df[filtered_df["pg_numeric"].isnull()] + # Copy the numeric PG column to a new column + df["pg_numeric"] = df[pg_column].copy() # Create a mapping dictionary from the 2 columns - map_dict = dict(zip(sicmapper[from_col], sicmapper[to_col])) - # Flag all SIC numbers that don't have a corresponding map value + map_dict = dict(zip(mapper[from_col], mapper[to_col])) + + # Flag all PGs that don't have a corresponding map value mapless_errors = [] for key, value in map_dict.items(): if str(value) == "nan": @@ -109,22 +118,15 @@ def sic_to_pg_mapper( if mapless_errors: PgLogger.error( - f"Mapping doesnt exist for the following SIC numbers: {mapless_errors}" + f"Mapping doesnt exist for the following product groups: {mapless_errors}" ) - # Map to the target column using the dictionary taking into account the null values. 
- # Then convert to categorigal datatype - filtered_df[sic_column] = pd.to_numeric(filtered_df[sic_column], errors="coerce") - filtered_df[target_col] = filtered_df[sic_column].map(map_dict) - filtered_df[target_col] = filtered_df[target_col].astype("category") - df = df.copy() + df[pg_column] = df[pg_column].map(map_dict) - df.loc[ - filtered_df.index, - f"{target_col}", - ] = filtered_df[target_col] + # Then convert the pg column and the new column to categorigal datatypes + df = df.astype({pg_column: "category", "pg_numeric": "category"}) - PgLogger.info("SIC numbers successfully mapped to PG letters") + PgLogger.info("Numeric product groups successfully mapped to letters.") return df @@ -147,22 +149,10 @@ def run_pg_conversion( Returns: (pd.DataFrame): Dataframe with mapped values """ + # Where the + df = sic_to_pg_mapper(df, sic_pg_alpha, ) - df["pg_numeric"] = df["201"].copy() - - if target_col == "201": - target_col = "201_mapping" - else: - # Create a new column to store PGs - df[target_col] = np.nan - - # SIC mapping for short forms - df = sic_to_pg_mapper(df, sic_pg_alpha, target_col=target_col) - - # SIC mapping for NI - df = sic_to_pg_mapper(df, sic_pg_alpha, target_col=target_col, formtype=["0003"]) - - # PG mapping for long forms + # PG numeric to alpha_numeric mapping for long forms df = pg_to_pg_mapper(df, pg_num_alpha, target_col=target_col) # Overwrite the 201 column if target_col = 201 diff --git a/src/staging/staging_main.py b/src/staging/staging_main.py index 8da3cbffd..6d072a475 100644 --- a/src/staging/staging_main.py +++ b/src/staging/staging_main.py @@ -217,7 +217,6 @@ def run_staging( backdata = pg.pg_to_pg_mapper( backdata, pg_num_alpha, - target_col="q201", pg_column="q201", ) StagingMainLogger.info("Backdata File Loaded Successfully...") diff --git a/tests/test_staging/test_pg_conversion.py b/tests/test_staging/test_pg_conversion.py index a77c2b9f5..0fa74af0c 100644 --- a/tests/test_staging/test_pg_conversion.py +++ b/tests/test_staging/test_pg_conversion.py @@ -8,72 +8,136 @@ @pytest.fixture -def dummy_data() -> pd.DataFrame: +def sic_dummy_data() -> pd.DataFrame: # Set up the dummyinput data - data = pd.DataFrame( - {"201": [0, 1, 2, 3, 4], "formtype": ["0001", "0001", "0001", "0001", "0001"]} - ) - return data + columns = ["201", "rusic"] + data = [ + [53, 2500], + [np.nan, 1600], + [np.nan, 4300], + ] + + return pd.DataFrame(data, columns=columns) @pytest.fixture -def mapper() -> pd.DataFrame: - # Set up the dummy mapper data - mapper = { - "pg_numeric": [0, 1, 2, 3, 4], - "pg_alpha": [np.nan, "A", "B", "C", "C"], - } - return pd.DataFrame(mapper) +def sic_mapper(): + columns = ["sic", "pg"] + mapper_rows = [ + [1600, 36], + [2500, 95], + [7300, 45], + [2500, 53], + ] + + # Create the DataFrame + return pd.DataFrame(mapper_rows, columns=columns) @pytest.fixture -def expected_output() -> pd.DataFrame: +def sic_expected_output() -> pd.DataFrame: # Set up the dummy output data - expected_output = pd.DataFrame( - { - "201": [np.nan, "A", "B", "C", "C"], - "formtype": ["0001", "0001", "0001", "0001", "0001"], - } - ) + columns = ["201", "rusic"] + data = [ + [53, 2500], + [36, 1600], + [np.nan, 4300], + ] - expected_output["201"] = expected_output["201"].astype("category") - return expected_output + return pd.DataFrame(data, columns=columns) -@pytest.fixture -def sic_dummy_data() -> pd.DataFrame: - # Set up the dummyinput data - data = pd.DataFrame( - {"rusic": [1110, 10101], "201": [np.nan, np.nan], "formtype": ["0006", "0006"]} - ) - return data +def 
test_sic_mapper(sic_dummy_data, sic_expected_output, sic_mapper): + """Tests for pg mapper function.""" + expected_output_data = sic_expected_output -@pytest.fixture -def sic_mapper() -> pd.DataFrame: - # Set up the dummy mapper data - mapper = { - "sic": [1110, 10101], - "pg_alpha": ["A", "B"], - } - return pd.DataFrame(mapper) + df_result = sic_to_pg_mapper( + sic_dummy_data, + sic_mapper, + pg_column="201", + from_col="sic", + to_col="pg", + ) + + pd.testing.assert_frame_equal(df_result, expected_output_data) @pytest.fixture -def sic_expected_output() -> pd.DataFrame: - # Set up the dummy output data - expected_output = pd.DataFrame( - {"rusic": [1110, 10101], "201": ["A", "B"], "formtype": ["0006", "0006"]} - ) - expected_output["201"] = expected_output["201"].astype("category") - return expected_output +def mapper(): + mapper_rows = [ + [36, "N"], + [37, "Y"], + [45, "AC"], + [47, "AD"], + [49, "AD"], + [50, "AD"], + [58, "AH"], + ] + columns = ["pg_numeric", "pg_alpha"] + # Create the DataFrame + mapper_df = pd.DataFrame(mapper_rows, columns=columns) -def test_sic_mapper(sic_dummy_data, sic_expected_output, sic_mapper): - """Tests for pg mapper function.""" + # Return the DataFrame + return mapper_df - expected_output_data = sic_expected_output - df_result = sic_to_pg_mapper(sic_dummy_data, sic_mapper, target_col="201") +def test_pg_to_pg_mapper_with_many_to_one(mapper): - pd.testing.assert_frame_equal(df_result, expected_output_data) + columns = ["formtype", "201", "other_col"] + row_data = [ + ["0001", 45, "2020"], + ["0001", 49, "2020"], + ["0002", 50, "2020"] + ] + + test_df = pd.DataFrame(row_data, columns=columns) + + expected_columns = ["formtype", "201", "other_col", "pg_numeric"] + + expected_data = [ + ["0001", "AC", "2020", 45], + ["0001", "AD", "2020", 49], + ["0002", "AD", "2020", 50] + ] + + type_dict = {"201": "category", "pg_numeric": "category"} + + # Build the expected result dataframe. 
Set the dtype of prod group to cat, like the result_df + expected_result_df = pd.DataFrame(expected_data, columns=expected_columns) + expected_result_df = expected_result_df.astype(type_dict) + + result_df = pg_to_pg_mapper(test_df.copy(), mapper.copy()) + + pd.testing.assert_frame_equal(result_df, expected_result_df, check_dtype=False) + + +def test_pg_to_pg_mapper_success(mapper): + columns = ["formtype", "201", "other_col"] + row_data = [ + ["0001", 36, "2020"], + ["0001", 45, "2020"], + ["0002", 58, "2020"], + ["0001", 49, "2020"], + ] + + test_df = pd.DataFrame(row_data, columns=columns) + + expected_columns = ["formtype", "201", "other_col", "pg_numeric"] + expected_data = [ + ["0001", "N", "2020", 36], + ["0001", "AC", "2020", 45], + ["0002", "AH", "2020", 58], + ["0001", "AD", "2020", 49], + ] + + expected_result_df = pd.DataFrame( + expected_data, columns=expected_columns) + + type_dict = {"201": "category", "pg_numeric": "category"} + expected_result_df = expected_result_df.astype(type_dict) + + result_df = pg_to_pg_mapper(test_df.copy(), mapper.copy()) + + pd.testing.assert_frame_equal(result_df, expected_result_df) From 53bb0944102de8a4fff388a83d5ec4bf6606b92b Mon Sep 17 00:00:00 2001 From: Griffith Date: Thu, 11 Jan 2024 18:38:00 +0000 Subject: [PATCH 15/26] move pg_conversion to imputation --- src/imputation/imputation_main.py | 24 ++++++++++++++++++-- src/{staging => imputation}/pg_conversion.py | 16 ++++--------- src/imputation/tmi_imputation.py | 13 +++-------- src/outputs/form_output_prep.py | 10 +++++--- src/outputs/ni_sas.py | 2 +- src/outputs/outputs_main.py | 4 ++-- src/outputs/tau.py | 1 - src/pipeline.py | 5 ++-- tests/test_staging/test_pg_conversion.py | 2 +- 9 files changed, 44 insertions(+), 33 deletions(-) rename src/{staging => imputation}/pg_conversion.py (91%) diff --git a/src/imputation/imputation_main.py b/src/imputation/imputation_main.py index a023f982c..34a7172d7 100644 --- a/src/imputation/imputation_main.py +++ b/src/imputation/imputation_main.py @@ -7,6 +7,7 @@ from src.imputation import imputation_helpers as hlp from src.imputation import tmi_imputation as tmi from src.staging.validation import load_schema +from src.imputation.pg_conversion import run_pg_conversion, pg_to_pg_mapper from src.imputation.apportionment import run_apportionment from src.imputation.short_to_long import run_short_to_long from src.imputation.MoR import run_mor @@ -21,7 +22,8 @@ def run_imputation( df: pd.DataFrame, manual_trimming_df: pd.DataFrame, - mapper: pd.DataFrame, + pg_num_alpha: pd.DataFrame, + sic_pg_num: pd.DataFrame, backdata: pd.DataFrame, config: Dict[str, Any], write_csv: Callable, @@ -48,6 +50,11 @@ def run_imputation( Returns: pd.DataFrame: dataframe with the imputed columns updated """ + # Carry out product group conversion + df = run_pg_conversion( + df, pg_num_alpha, sic_pg_num, pg_column="201" + ) + # Apportion cols 4xx and 5xx to create FTE and headcount values df = run_apportionment(df) @@ -92,11 +99,24 @@ def run_imputation( # Run MoR if backdata is not None: + # Fix for different column names on network vs hdfs + if NETWORK_OR_HDFS == "network": + # Map PG numeric to alpha in column q201 + # This isn't done on HDFS as the column is already mapped + backdata = pg_to_pg_mapper( + backdata, + pg_num_alpha, + pg_column="q201", + from_col= "pg_numeric", + to_col="pg_alpha", + ) + backdata = backdata.drop("pg_numeric", axis=1) + lf_target_vars = config["imputation"]["lf_target_vars"] df, links_df = run_mor(df, backdata, to_impute_cols, lf_target_vars, 
config) # Run TMI for long forms and short forms - imputed_df, qa_df = tmi.run_tmi(df, mapper, config) + imputed_df, qa_df = tmi.run_tmi(df, config) # After imputation, correction to ignore the "604" == "No" in any records with # Status "check needed" diff --git a/src/staging/pg_conversion.py b/src/imputation/pg_conversion.py similarity index 91% rename from src/staging/pg_conversion.py rename to src/imputation/pg_conversion.py index 5fbca575c..4649096a9 100644 --- a/src/staging/pg_conversion.py +++ b/src/imputation/pg_conversion.py @@ -134,8 +134,8 @@ def pg_to_pg_mapper( def run_pg_conversion( df: pd.DataFrame, pg_num_alpha: pd.DataFrame, - sic_pg_alpha: pd.DataFrame, - target_col: str = "201", + sic_pg_num: pd.DataFrame, + pg_column: str = "201", ): """Run the product group mapping functions and return a dataframe with the correct mapping for each formtype. @@ -143,21 +143,15 @@ def run_pg_conversion( Args: df (pd.DataFrame): Dataframe of full responses data mapper (pd.DataFrame): The mapper file used for PG conversion - target_col (str, optional): The column to be created - which stores mapped values. + pg_column: The original product group column Returns: (pd.DataFrame): Dataframe with mapped values """ # Where the - df = sic_to_pg_mapper(df, sic_pg_alpha, ) + df = sic_to_pg_mapper(df, sic_pg_num, pg_column) # PG numeric to alpha_numeric mapping for long forms - df = pg_to_pg_mapper(df, pg_num_alpha, target_col=target_col) - - # Overwrite the 201 column if target_col = 201 - if target_col == "201_mapping": - df["201"] = df[target_col] - df = df.drop(columns=[target_col]) + df = pg_to_pg_mapper(df, pg_num_alpha, pg_column) return df diff --git a/src/imputation/tmi_imputation.py b/src/imputation/tmi_imputation.py index ecd170875..c3ea7eaff 100644 --- a/src/imputation/tmi_imputation.py +++ b/src/imputation/tmi_imputation.py @@ -3,7 +3,7 @@ import numpy as np from typing import Dict, List, Tuple, Any -from src.staging.pg_conversion import sic_to_pg_mapper +from src.imputation.pg_conversion import sic_to_pg_mapper from src.imputation.impute_civ_def import impute_civil_defence from src.imputation import expansion_imputation as ximp @@ -425,7 +425,6 @@ def calculate_totals(df): def run_longform_tmi( longform_df: pd.DataFrame, - sic_mapper: pd.DataFrame, config: Dict[str, Any], ) -> Tuple[pd.DataFrame, pd.DataFrame]: """Function to run imputation end to end and returns the final @@ -434,7 +433,6 @@ def run_longform_tmi( Args: longform_df (pd.DataFrame): the dataset filtered for long form entries target_variables (list): key variables - sic_mapper (pd.DataFrame): dataframe with sic mapper info config (Dict): the configuration settings Returns: final_df: dataframe with the imputed valued added @@ -442,10 +440,7 @@ def run_longform_tmi( qa_df: qa dataframe """ TMILogger.info("Starting TMI long form imputation.") - - # TMI Step 1: impute the Product Group - df = impute_pg_by_sic(longform_df, sic_mapper) - + df = longform_df.copy() # TMI Step 2: impute for R&D type (civil or defence) df = impute_civil_defence(df) @@ -520,7 +515,6 @@ def run_shortform_tmi( def run_tmi( full_df: pd.DataFrame, - sic_mapper: pd.DataFrame, config: Dict[str, Any], ) -> Tuple[pd.DataFrame, pd.DataFrame]: """Function to run imputation end to end and returns the final @@ -528,7 +522,6 @@ def run_tmi( dataframe back to the pipeline Args: full_df (pd.DataFrame): the full responses spp dataframe - sic_mapper (pd.DataFrame): dataframe with sic to product group mapper info config (Dict): the configuration settings Returns: 
final_df(pd.DataFrame): dataframe with the imputed valued added and counts columns @@ -553,7 +546,7 @@ def run_tmi( excluded_df = full_df.copy().loc[mor_mask] # apply TMI imputation to long forms and then short forms - longform_tmi_df, qa_df_long = run_longform_tmi(longform_df, sic_mapper, config) + longform_tmi_df, qa_df_long = run_longform_tmi(longform_df, config) shortform_tmi_df, qa_df_short = run_shortform_tmi(shortform_df, config) diff --git a/src/outputs/form_output_prep.py b/src/outputs/form_output_prep.py index 7b2401562..4ac885b41 100644 --- a/src/outputs/form_output_prep.py +++ b/src/outputs/form_output_prep.py @@ -1,5 +1,5 @@ import pandas as pd -from src.staging.pg_conversion import sic_to_pg_mapper +from src.imputation.pg_conversion import run_pg_conversion from src.staging.validation import flag_no_rand_spenders @@ -8,7 +8,7 @@ def form_output_prep( weighted_df: pd.DataFrame, ni_full_responses: pd.DataFrame, pg_num_alpha: pd.DataFrame, - sic_pg_alpha: pd.DataFrame, + sic_pg_num: pd.DataFrame, ): """Prepares the data for the outputs. @@ -61,7 +61,11 @@ def form_output_prep( ni_full_responses["formtype"] = "0003" # Update column 201 (currently PG numeric) to alpha-numeric, mapping from SIC. - ni_full_responses = sic_to_pg_mapper(ni_full_responses, sic_pg_alpha) + ni_full_responses = run_pg_conversion( + ni_full_responses, + pg_num_alpha, + sic_pg_num + ) # outputs_df = pd.concat([outputs_df, ni_full_responses]) tau_outputs_df = pd.concat([tau_outputs_df, ni_full_responses]) diff --git a/src/outputs/ni_sas.py b/src/outputs/ni_sas.py index b9ea85285..538dcf9f7 100644 --- a/src/outputs/ni_sas.py +++ b/src/outputs/ni_sas.py @@ -6,7 +6,7 @@ import src.outputs.map_output_cols as map_o from src.staging.validation import load_schema from src.outputs.outputs_helpers import create_output_df -from src.staging.pg_conversion import sic_to_pg_mapper +from src.imputation.pg_conversion import sic_to_pg_mapper OutputMainLogger = logging.getLogger(__name__) diff --git a/src/outputs/outputs_main.py b/src/outputs/outputs_main.py index 161c0be35..1de77450b 100644 --- a/src/outputs/outputs_main.py +++ b/src/outputs/outputs_main.py @@ -58,7 +58,7 @@ def run_outputs( civil_defence_detailed (pd.DataFrame): Detailed descriptons of civil/defence sic_division_detailed (pd.DataFrame): Detailed descriptons of SIC divisions pg_num_alpha (pd.DataFrame): Mapper for product group conversions (num to alpha) - sic_pg_alpha (pd.DataFrame): Mapper for product group conversions (SIC to alpha) + sic_pg_num (pd.DataFrame): Mapper for product group conversions """ ( @@ -71,7 +71,7 @@ def run_outputs( weighted_df, ni_full_responses, pg_num_alpha, - sic_pg_alpha, + sic_pg_num, ) # Running status filtered full dataframe output for QA diff --git a/src/outputs/tau.py b/src/outputs/tau.py index 329ab32d3..02e7ed11b 100644 --- a/src/outputs/tau.py +++ b/src/outputs/tau.py @@ -6,7 +6,6 @@ import src.outputs.map_output_cols as map_o from src.staging.validation import load_schema from src.outputs.outputs_helpers import create_output_df -from src.staging.pg_conversion import sic_to_pg_mapper OutputMainLogger = logging.getLogger(__name__) diff --git a/src/pipeline.py b/src/pipeline.py index 81ded7174..83f9cccad 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -137,7 +137,8 @@ def run_pipeline(start, config_path): imputed_df = run_imputation( full_responses, manual_trimming_df, - sic_pg_alpha, + pg_num_alpha, + sic_pg_num, backdata, config, write_csv, @@ -196,7 +197,7 @@ def run_pipeline(start, config_path): 
civil_defence_detailed, sic_division_detailed, pg_num_alpha, - sic_pg_alpha, + sic_pg_num, ) MainLogger.info("Finished All Output modules.") diff --git a/tests/test_staging/test_pg_conversion.py b/tests/test_staging/test_pg_conversion.py index 0fa74af0c..d39418fd7 100644 --- a/tests/test_staging/test_pg_conversion.py +++ b/tests/test_staging/test_pg_conversion.py @@ -4,7 +4,7 @@ import pytest import numpy as np -from src.staging.pg_conversion import pg_to_pg_mapper, sic_to_pg_mapper +from src.imputation.pg_conversion import pg_to_pg_mapper, sic_to_pg_mapper @pytest.fixture From 8b0176accc500d0174923aa2face4ffb90eb350e Mon Sep 17 00:00:00 2001 From: Griffith Date: Mon, 15 Jan 2024 10:09:19 +0000 Subject: [PATCH 16/26] 648 minor changes --- src/imputation/pg_conversion.py | 15 ++++++++------- src/staging/staging_main.py | 15 --------------- 2 files changed, 8 insertions(+), 22 deletions(-) diff --git a/src/imputation/pg_conversion.py b/src/imputation/pg_conversion.py index 4649096a9..76bcf3dd1 100644 --- a/src/imputation/pg_conversion.py +++ b/src/imputation/pg_conversion.py @@ -73,8 +73,10 @@ def pg_to_pg_mapper( ): """Map from PG numeric to PG alpha-numeric and create a new column. - The product group column (default: column 201) coped to a new column, "pg_numeric", - and then is updated from numeric to alpha-numeric using a mapping. + The mapper used is from a file named pg_num_alpha.csv + + The product group column (default: column 201) is copied to a new column, + "pg_numeric", and then the original column is mapped from numeric to alpha-numeric. Example initial dataframe: reference | 201 @@ -137,18 +139,17 @@ def run_pg_conversion( sic_pg_num: pd.DataFrame, pg_column: str = "201", ): - """Run the product group mapping functions and return a - dataframe with the correct mapping for each formtype. + """Run the product group (PG) mapping functions. Args: df (pd.DataFrame): Dataframe of full responses data - mapper (pd.DataFrame): The mapper file used for PG conversion - pg_column: The original product group column + pg_num_alpha (pd.DataFrame): Mapper from numeric to alpha-numeric PG. + pg_column: The original product group column, default 201 Returns: (pd.DataFrame): Dataframe with mapped values """ - # Where the + # Where product group is null, map it from SIC. df = sic_to_pg_mapper(df, sic_pg_num, pg_column) # PG numeric to alpha_numeric mapping for long forms diff --git a/src/staging/staging_main.py b/src/staging/staging_main.py index 6d072a475..383c18d14 100644 --- a/src/staging/staging_main.py +++ b/src/staging/staging_main.py @@ -8,7 +8,6 @@ # Our own modules from src.staging import validation as val -from src.staging import pg_conversion as pg import src.staging.staging_helpers as helpers @@ -210,15 +209,6 @@ def run_staging( # backdata_path, "./config/backdata_schema.toml" # ) - # Fix for different column names on network vs hdfs - if network_or_hdfs == "network": - # Map PG numeric to alpha in column q201 - # This isn't done on HDFS as the column is already mapped - backdata = pg.pg_to_pg_mapper( - backdata, - pg_num_alpha, - pg_column="q201", - ) StagingMainLogger.info("Backdata File Loaded Successfully...") else: backdata = None @@ -286,11 +276,6 @@ def run_staging( mapper_path = paths["mapper_path"] write_csv(f"{mapper_path}/sic_pg_num.csv", sic_pg_utf_mapper) - # Map PG from SIC/PG numbers to column '201'. 
- full_responses = pg.run_pg_conversion( - full_responses, pg_num_alpha, sic_pg_alpha_mapper, target_col="201" - ) - pg_detailed_mapper = helpers.load_valdiate_mapper( "pg_detailed_mapper_path", paths, From eb637e2ad47cd176db2341e2de405e0fcb94a2c2 Mon Sep 17 00:00:00 2001 From: Griffith Date: Mon, 15 Jan 2024 10:42:33 +0000 Subject: [PATCH 17/26] add exception if mapper not working --- src/imputation/pg_conversion.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/imputation/pg_conversion.py b/src/imputation/pg_conversion.py index 76bcf3dd1..fa6d0556b 100644 --- a/src/imputation/pg_conversion.py +++ b/src/imputation/pg_conversion.py @@ -54,6 +54,8 @@ def sic_to_pg_mapper( PgLogger.error( f"Mapping doesnt exist for the following SIC numbers: {mapless_errors}" ) + raise Exception("Errors in the SIC to PG numeric mapper.") + # Map to the target column using the dictionary, null values only df.loc[df[pg_column].isnull(), pg_column] = ( df.loc[df[pg_column].isnull(), sic_column].map(map_dict) @@ -122,6 +124,7 @@ def pg_to_pg_mapper( PgLogger.error( f"Mapping doesnt exist for the following product groups: {mapless_errors}" ) + raise Exception("Errors in the PG numeric to alpha-numeric mapper.") df[pg_column] = df[pg_column].map(map_dict) From 58e7e578e8f9c51c78c7637c5dbe14a17c6609cf Mon Sep 17 00:00:00 2001 From: Griffith Date: Mon, 15 Jan 2024 10:46:39 +0000 Subject: [PATCH 18/26] remove duplicate line from config --- src/developer_config.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/src/developer_config.yaml b/src/developer_config.yaml index 7408666a9..a771aade6 100644 --- a/src/developer_config.yaml +++ b/src/developer_config.yaml @@ -27,7 +27,6 @@ global: output_auto_outliers: False output_outlier_qa : False output_estimation_qa: False - output_imputation_qa: False output_apportionment_qa: False output_long_form: False output_short_form: False From 86a91e3fceed8aa9bb6ad00fa46dd97ccb8acc1a Mon Sep 17 00:00:00 2001 From: Griffith Date: Mon, 15 Jan 2024 11:39:45 +0000 Subject: [PATCH 19/26] remove unnecessary pg conversion from NI sas --- src/outputs/ni_sas.py | 16 +--------------- src/outputs/outputs_main.py | 2 -- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/src/outputs/ni_sas.py b/src/outputs/ni_sas.py index 538dcf9f7..717f76854 100644 --- a/src/outputs/ni_sas.py +++ b/src/outputs/ni_sas.py @@ -6,7 +6,7 @@ import src.outputs.map_output_cols as map_o from src.staging.validation import load_schema from src.outputs.outputs_helpers import create_output_df -from src.imputation.pg_conversion import sic_to_pg_mapper +from src.imputation.pg_conversion import run_pg_conversion OutputMainLogger = logging.getLogger(__name__) @@ -16,8 +16,6 @@ def output_ni_sas( config: Dict[str, Any], write_csv: Callable, run_id: int, - sic_pg_num: pd.DataFrame, - postcode_itl_mapper: pd.DataFrame, ): """Run the outputs module. 
@@ -39,18 +37,6 @@ def output_ni_sas( paths = config[f"{NETWORK_OR_HDFS}_paths"] output_path = paths["output_path"] - # Prepare the columns needed for outputs: - - # Fill in numeric PG where missing - df = sic_to_pg_mapper( - df, - sic_pg_num, - target_col="pg_numeric", - from_col="SIC 2007_CODE", - to_col="2016 > Form PG", - formtype=["0003"], - ) - # Map the sizebands based on frozen employment df = map_o.map_sizebands(df) diff --git a/src/outputs/outputs_main.py b/src/outputs/outputs_main.py index 1de77450b..5bc3556fe 100644 --- a/src/outputs/outputs_main.py +++ b/src/outputs/outputs_main.py @@ -147,8 +147,6 @@ def run_outputs( config, write_csv, run_id, - sic_pg_num, - postcode_mapper, ) OutputMainLogger.info("Finished NI SAS output.") From 2309d54ed28057887b97f237eb1e283ad957f07c Mon Sep 17 00:00:00 2001 From: George Zorinyants Date: Mon, 15 Jan 2024 11:58:40 +0000 Subject: [PATCH 20/26] Postcode top up returns an empty string when the postcode is empty --- src/developer_config.yaml | 4 ++-- src/staging/staging_helpers.py | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/developer_config.yaml b/src/developer_config.yaml index e07316c20..90ed0eb16 100644 --- a/src/developer_config.yaml +++ b/src/developer_config.yaml @@ -9,7 +9,7 @@ global: # Staging and validation settings postcode_csv_check: False load_updated_snapshot: False # Whether to load the updated snapshots for amendments and additions - load_ni_data: True + load_ni_data: False load_historic_data: False run_construction: False run_ni_construction: True @@ -38,7 +38,7 @@ global: output_intram_by_itl1: False output_intram_by_civil_defence: False output_intram_by_sic: False - output_status_filtered: False + output_status_filtered: True output_fte_total_qa: False years: current_year: 2022 # TODO: put this in the userconfig diff --git a/src/staging/staging_helpers.py b/src/staging/staging_helpers.py index 0cb87b58e..4d53efa52 100644 --- a/src/staging/staging_helpers.py +++ b/src/staging/staging_helpers.py @@ -33,8 +33,7 @@ def postcode_topup(mystr: str, target_len: int = 8) -> str: spaces and cuts the tail on the right. If there is only one part, keeps the first 8 characters and tops it up with spaces on the right if needed. - Empty input string would have zero parts and will return a string of - eight spaces. + Empty input string would have zero parts and will return an empty string. Args: mystr (str): Input postcode. 
@@ -69,7 +68,7 @@ def postcode_topup(mystr: str, target_len: int = 8) -> str: return (part1 + part2)[:target_len] else: - return mystr[:target_len].ljust(target_len, " ") + return "" def fix_anon_data(responses_df, config): From f89e860395c443cee517fdb8bd6eb2faf8e43cf2 Mon Sep 17 00:00:00 2001 From: Cheshire Date: Mon, 15 Jan 2024 13:53:54 +0000 Subject: [PATCH 21/26] RDRP-646: move short to long to only run on GB --- src/construction/construction.py | 8 ++++---- src/developer_config.yaml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/construction/construction.py b/src/construction/construction.py index 7daf927ac..939899bd4 100644 --- a/src/construction/construction.py +++ b/src/construction/construction.py @@ -80,13 +80,13 @@ def run_construction( # Add flags to indicate whether a row was constructed or should be imputed updated_snapshot_df["is_constructed"] = False updated_snapshot_df["force_imputation"] = False - - # Prepare the short to long form constructions - updated_snapshot_df = prepare_short_to_long(updated_snapshot_df, construction_df) construction_df["is_constructed"] = True - # Create period_year column, except for NI which already has it + # Run GB specific actions if not is_northern_ireland: + # Prepare the short to long form constructions (N/A to NI) + updated_snapshot_df = prepare_short_to_long(updated_snapshot_df, construction_df) + # Create period_year column (NI already has it) updated_snapshot_df = create_period_year(updated_snapshot_df) construction_df = create_period_year(construction_df) diff --git a/src/developer_config.yaml b/src/developer_config.yaml index d34950dbd..066c5f58f 100644 --- a/src/developer_config.yaml +++ b/src/developer_config.yaml @@ -11,7 +11,7 @@ global: load_updated_snapshot: False # Whether to load the updated snapshots for amendments and additions load_ni_data: False load_historic_data: False - run_construction: True + run_construction: False run_ni_construction: False load_manual_outliers: False load_manual_imputation: False From 079b8192553efb7b00acd1adbef00475c6624155 Mon Sep 17 00:00:00 2001 From: Cheshire Date: Mon, 15 Jan 2024 14:34:44 +0000 Subject: [PATCH 22/26] RDRP-646: moved postcode function to GB only --- src/construction/construction.py | 33 ++++++++++++++++---------------- src/developer_config.yaml | 3 ++- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/src/construction/construction.py b/src/construction/construction.py index c8aae69da..126e04a34 100644 --- a/src/construction/construction.py +++ b/src/construction/construction.py @@ -90,10 +90,9 @@ def run_construction( # Create period_year column (NI already has it) updated_snapshot_df = create_period_year(updated_snapshot_df) construction_df = create_period_year(construction_df) - - # Set instance=1 so longforms with status 'Form sent out' match correctly - form_sent_condition = (updated_snapshot_df.formtype == "0001") & (updated_snapshot_df.status == "Form sent out") - updated_snapshot_df.loc[form_sent_condition, "instance"] = 1 + # Set instance=1 so longforms with status 'Form sent out' match correctly + form_sent_condition = (updated_snapshot_df.formtype == "0001") & (updated_snapshot_df.status == "Form sent out") + updated_snapshot_df.loc[form_sent_condition, "instance"] = 1 # NI data has no instance but needs an instance of 1 if is_northern_ireland: @@ -123,18 +122,20 @@ def run_construction( {"reference": "Int64", "instance": "Int64", "period_year": "Int64"} ) - # Long form records with a postcode in 601 use this as the postcode 
- long_form_cond = (~updated_snapshot_df["601"].isnull()) - updated_snapshot_df.loc[long_form_cond, "postcodes_harmonised"] = updated_snapshot_df["601"] - - # Short form records with nothing in 601 use referencepostcode instead - short_form_cond = (updated_snapshot_df["601"].isnull()) & (~updated_snapshot_df["referencepostcode"].isnull()) - updated_snapshot_df.loc[short_form_cond, "postcodes_harmonised"] = updated_snapshot_df["referencepostcode"] - - # Top up all new postcodes so they're all eight characters exactly - postcode_cols = ["601", "referencepostcode", "postcodes_harmonised"] - for col in postcode_cols: - updated_snapshot_df[col] = updated_snapshot_df[col].apply(postcode_topup) + # Run GB specific actions + if not is_northern_ireland: + # Long form records with a postcode in 601 use this as the postcode + long_form_cond = (~updated_snapshot_df["601"].isnull()) + updated_snapshot_df.loc[long_form_cond, "postcodes_harmonised"] = updated_snapshot_df["601"] + + # Short form records with nothing in 601 use referencepostcode instead + short_form_cond = (updated_snapshot_df["601"].isnull()) & (~updated_snapshot_df["referencepostcode"].isnull()) + updated_snapshot_df.loc[short_form_cond, "postcodes_harmonised"] = updated_snapshot_df["referencepostcode"] + + # Top up all new postcodes so they're all eight characters exactly + postcode_cols = ["601", "referencepostcode", "postcodes_harmonised"] + for col in postcode_cols: + updated_snapshot_df[col] = updated_snapshot_df[col].apply(postcode_topup) updated_snapshot_df = updated_snapshot_df.sort_values( ["reference", "instance"], ascending=[True, True] diff --git a/src/developer_config.yaml b/src/developer_config.yaml index 8c1ddee7f..dc4df2a4a 100644 --- a/src/developer_config.yaml +++ b/src/developer_config.yaml @@ -27,6 +27,7 @@ global: output_outlier_qa : False output_estimation_qa: False output_apportionment_qa: False + output_imputation_qa: False output_long_form: False output_short_form: False output_gb_sas: False @@ -36,7 +37,7 @@ global: output_intram_by_itl1: False output_intram_by_civil_defence: False output_intram_by_sic: False - output_status_filtered: True + output_status_filtered: False output_fte_total_qa: False years: current_year: 2022 # TODO: put this in the userconfig From faac8f5ccda56e797e0b7954f6de590c5aec7e91 Mon Sep 17 00:00:00 2001 From: Griffith Date: Mon, 15 Jan 2024 14:51:55 +0000 Subject: [PATCH 23/26] correct previous merge error in validation --- src/developer_config.yaml | 6 +-- src/staging/validation.py | 77 ++++++++++++++++++++++++++++++++------- 2 files changed, 67 insertions(+), 16 deletions(-) diff --git a/src/developer_config.yaml b/src/developer_config.yaml index 20ed94430..60aa0136b 100644 --- a/src/developer_config.yaml +++ b/src/developer_config.yaml @@ -12,7 +12,7 @@ global: load_ni_data: False load_historic_data: False run_construction: False - run_ni_construction: True + run_ni_construction: False load_manual_outliers: False load_manual_imputation: False load_backdata: True # whether to load previous year data for MoR @@ -26,8 +26,8 @@ global: output_imputation_qa: False output_auto_outliers: False output_outlier_qa : False - output_estimation_qa: False - output_apportionment_qa: False + output_estimation_qa: True + output_apportionment_qa: True output_long_form: False output_short_form: False output_gb_sas: False diff --git a/src/staging/validation.py b/src/staging/validation.py index 417a28116..ca0274209 100644 --- a/src/staging/validation.py +++ b/src/staging/validation.py @@ -334,6 +334,9 @@ 
def load_schema(file_path: str = "./config/contributors_schema.toml") -> dict: toml_dict = toml.load(file_path) else: # Return False if file does not exist + ValidationLogger.warning( + "Validation schema does not exist! Path may be incorrect" + ) return file_exists return toml_dict @@ -416,6 +419,9 @@ def validate_data_with_schema(survey_df: pd.DataFrame, schema_path: str): # Load schema from toml dtypes_schema = load_schema(schema_path) + if not dtypes_schema: + raise FileNotFoundError(f"File at {schema_path} does not exist. Check path") + # Create a dict for dtypes only dtypes_dict = { column_nm: dtypes_schema[column_nm]["Deduced_Data_Type"] @@ -442,6 +448,15 @@ def validate_data_with_schema(survey_df: pd.DataFrame, schema_path: str): survey_df[column] = survey_df[column].astype(pd.Int64Dtype()) elif dtypes_dict[column] == "str": survey_df[column] = survey_df[column].astype("string") + elif "datetime" in dtypes_dict[column]: + try: + survey_df[column] = pd.to_datetime( + survey_df[column], errors="coerce" + ) + except TypeError: + raise TypeError( + f"Failed to convert column '{column}' to datetime. Please check the data." + ) else: survey_df[column] = survey_df[column].astype(dtypes_dict[column]) ValidationLogger.debug(f"{column} after: {survey_df[column].dtype}") @@ -551,22 +566,31 @@ def check_ultfoc(value): @time_logger_wrap @exception_wrap -def validate_many_to_one( - mapper: pd.DataFrame, col_many: str, col_one: str -) -> pd.DataFrame: +def validate_many_to_one(*args) -> pd.DataFrame: """ + Validates a many-to-one mapper DataFrame. - Validates a many to one mapper: - 1. Checks if the mapper has two columns col_many and col_one. - 2. Salects and deduplicates col_many and col_one. - 3. Checks that for each entry in col_many there is exactly one entry in - col_one. + This function performs the following checks: + 1. Checks if the mapper has two specified columns, referred to as 'col_many' and 'col_one'. + 2. Selects and deduplicates 'col_many' and 'col_one'. + 3. Checks that for each entry in 'col_many' there is exactly one corresponding entry in 'col_one'. Args: - df (pd.DataFrame): The input mapper - col_many (str): name of the column with many entries - col_one (str): name of the column with one entry + *args: Variable length argument list. It should contain the following items in order: + - df (pd.DataFrame): The input mapper DataFrame. + - col_many (str): The name of the column with many entries. + - col_one (str): The name of the column with one entry. + + Returns: + pd.DataFrame: The validated mapper DataFrame with deduplicated 'col_many' and 'col_one' columns. + + Raises: + ValueError: If the mapper does not have the 'col_many' and 'col_one' columns, or if there are multiple entries in 'col_one' for any entry in 'col_many'. 
""" + + mapper = args[0] + col_many = args[1] + col_one = args[2] try: # Check that expected column are present cols = mapper.columns @@ -588,7 +612,7 @@ def validate_many_to_one( ValidationLogger.info( "The following codes have multile mapping: \n {df_bad}" ) - raise ValueError(f"Mapper is many to many") + raise ValueError("Mapper is many to many") return df except ValueError as ve: @@ -625,7 +649,7 @@ def validate_cora_df(df: pd.DataFrame) -> pd.DataFrame: df["contents_check"] = status_check & from_status_check # Check if there are any False values in the "contents_check" column - if (df["contents_check"] == False).any(): + if (df["contents_check"] == False).any(): # noqa raise ValueError("Unexpected format within column contents") # Drop the "contents_check" column @@ -635,3 +659,30 @@ def validate_cora_df(df: pd.DataFrame) -> pd.DataFrame: except ValueError as ve: raise ValueError("cora status mapper validation failed: " + str(ve)) + + +def flag_no_rand_spenders(df, raise_or_warn): + """ + Flags any records that answer "No" to "604" and also report their expenditure in "211" as more than 0. + + Parameters: + df (pandas.DataFrame): The input DataFrame. + + Returns: + None + """ + invalid_records = df.loc[(df["604"] == "No") & (df["211"] > 0)] + + if not invalid_records.empty: + if raise_or_warn == "raise": + raise Exception("Some records report no R&D, but spend in 211 > 0.") + elif raise_or_warn == "warn": + total_invalid_spend = invalid_records["211"].sum() + ValidationLogger.error("Some records report no R&D, but spend in 211 > 0.") + ValidationLogger.error( + f"The total spend of 'No' R&D companies is £{int(total_invalid_spend)}" + ) + ValidationLogger.error(invalid_records) + + else: + ValidationLogger.debug("All records have valid R&D spend.") From dd4092e63e64ee669f731f59e3053f1713daa567 Mon Sep 17 00:00:00 2001 From: Griffith Date: Mon, 15 Jan 2024 16:23:00 +0000 Subject: [PATCH 24/26] fill nulls in MoR and CF --- src/_version.py | 2 +- src/developer_config.yaml | 4 ++-- src/imputation/MoR.py | 12 +++++++----- src/imputation/imputation_main.py | 7 ++++--- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/_version.py b/src/_version.py index 43c4ab005..22049ab2c 100644 --- a/src/_version.py +++ b/src/_version.py @@ -1 +1 @@ -__version__ = "0.6.1" +__version__ = "0.6.2" diff --git a/src/developer_config.yaml b/src/developer_config.yaml index 2b4190ede..bc1052920 100644 --- a/src/developer_config.yaml +++ b/src/developer_config.yaml @@ -11,7 +11,7 @@ global: load_updated_snapshot: False # Whether to load the updated snapshots for amendments and additions load_ni_data: False load_historic_data: False - run_construction: False + run_construction: True run_ni_construction: False load_manual_outliers: False load_manual_imputation: False @@ -87,7 +87,7 @@ network_paths: backdata_path: "R:/BERD Results System Development 2023/2021_data/validation-extract-responses-202112.csv" outliers_path: "R:/BERD Results System Development 2023/DAP_emulation/outliers" manual_outliers_path: "R:/BERD Results System Development 2023/DAP_emulation/outliers/manual_outliers/manual_outlier_2023-08-29_v67.csv" - construction_file_path: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/JC_test_construction_file.csv" + construction_file_path: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/test_construction_file.csv" construction_file_path_ni: "R:/BERD Results System Development 
2023/DAP_emulation/construction/manual_construction/test_construction_ni_file.csv" # construction_add_path: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/construction_additions_2023-11-06_v5.csv" # TODO Need to test # construction_amend_path: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/construction_amendments_2023-10-31_v2.csv" # TODO Need to test diff --git a/src/imputation/MoR.py b/src/imputation/MoR.py index 3878a46d4..e4d6ad82f 100644 --- a/src/imputation/MoR.py +++ b/src/imputation/MoR.py @@ -124,15 +124,17 @@ def carry_forwards(df, backdata, impute_vars): # Copy values from relevant columns where references match match_cond = df["_merge"] == "both" - # replace the values of certain columns with the values from the back data - # TODO: Check with methodology or BAU as to which other cols to take from backdata - # TODO: By default, columns not updated such as 4xx, 5xx will contain the current - # data, instance 0. + # Replace the values of certain columns with the values from the back data replace_vars = ["instance", "200", "201", "601", "602", "604"] for var in replace_vars: df.loc[match_cond, var] = df.loc[match_cond, f"{var}_prev"] + + # Update the varibles to be imputed by the corresponding previous values, filling + # nulls with zeros. for var in impute_vars: - df.loc[match_cond, f"{var}_imputed"] = df.loc[match_cond, f"{var}_prev"] + df.loc[match_cond, f"{var}_imputed"] = df.loc[ + match_cond, f"{var}_prev" + ].fillna(0) df.loc[match_cond, "imp_marker"] = "CF" df.loc[match_cond] = create_imp_class_col(df, "200_prev", "201_prev") diff --git a/src/imputation/imputation_main.py b/src/imputation/imputation_main.py index 64e9e6ca1..0dc271a2f 100644 --- a/src/imputation/imputation_main.py +++ b/src/imputation/imputation_main.py @@ -125,13 +125,14 @@ def run_imputation( # Changing all records that meet the criteria to "604" == "Yes" imputed_df.loc[(chk_mask & imputation_mask), "604"] = "Yes" - # Run short form expansion - imputed_df = run_sf_expansion(imputed_df, config) - # join constructed rows back to the imputed df + # Note that constructed rows need to be included in short form expansion if "is_constructed" in df.columns: imputed_df = pd.concat([imputed_df, constructed_df]) + # Run short form expansion + imputed_df = run_sf_expansion(imputed_df, config) + # join manually trimmed columns back to the imputed df if not trimmed_df.empty: imputed_df = pd.concat([imputed_df, trimmed_df]) From 8968c43e827c1187817c208c329f9b18fd42c047 Mon Sep 17 00:00:00 2001 From: Griffith Date: Mon, 15 Jan 2024 17:36:56 +0000 Subject: [PATCH 25/26] 654 bugfix in progress --- src/imputation/MoR.py | 3 +++ src/imputation/sf_expansion.py | 35 +++++++++++++++++++++++++++++----- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/src/imputation/MoR.py b/src/imputation/MoR.py index e4d6ad82f..8b5c3c9ef 100644 --- a/src/imputation/MoR.py +++ b/src/imputation/MoR.py @@ -10,6 +10,7 @@ calculate_totals, ) + good_statuses = ["Clear", "Clear - overridden"] bad_statuses = ["Form sent out", "Check needed"] @@ -112,6 +113,8 @@ def carry_forwards(df, backdata, impute_vars): df = pd.merge( df, backdata, how="left", on="reference", suffixes=("", "_prev"), indicator=True ) + # ensure the instance columns are still type "int" after merge + df = df.astype({"instance": "Int64", "instance_prev": "Int64"}) # keep only the rows needed, see function docstring for details. 
no_match_cond = df["_merge"] == "left_only" diff --git a/src/imputation/sf_expansion.py b/src/imputation/sf_expansion.py index 025fe43d2..82b534519 100644 --- a/src/imputation/sf_expansion.py +++ b/src/imputation/sf_expansion.py @@ -165,19 +165,44 @@ def apply_expansion( return expanded_df +def prepare_short_form_constructed(df: pd.DataFrame, master_cols: List) -> pd.DataFrame: + """Prepare the constructed short form responses for sf expansion. + + The constructed records were removed from imputation, so it is necessary to copy + the master columns to the empty "imputed" master columns. + It is also necessary to create imputation classes for these records. + + For example, column "211" needs to be copied to "211_imputed" in these cases. + """ + sf_constructed_mask = (df.formtype == "0006") & (df.imp_marker == "constructed") + + # Create imputation class for the short + df.loc[sf_constructed_mask, "imp_class"] = ( + df.loc[sf_constructed_mask, "200"] + df.loc[sf_constructed_mask, "201"] + ) + + # Copy the values of the master columns to the corresponding "_imputed" column + for col in master_cols: + df.loc[sf_constructed_mask, f"{col}_imputed"] = df.loc[sf_constructed_mask, col] + + return df + + @df_change_func_wrap def run_sf_expansion(df: pd.DataFrame, config: dict) -> pd.DataFrame: """Calculate the expansion imputated values for short forms using long form data.""" - - # Remove records that have the reference list variables - # and those that have "nan" in the imp class - filtered_df, excluded_df = split_df_on_imp_class(df) - # Get dictionary of short form master keys (or target variables) # and breakdown variables breakdown_dict = config["breakdowns"] master_values = list(breakdown_dict) + # Prepare constructed short-form entries for sf expansion imputation + df = prepare_short_form_constructed(df, master_values) + + # Remove records that have the reference list variables + # and those that have "nan" in the imp class + filtered_df, excluded_df = split_df_on_imp_class(df) + # Obtain the "threshold_num" from the config # (this is the minimum viable number in an imputation class) threshold_num = config["imputation"]["sf_expansion_threshold"] From e5b0d9ec14647966d3d13365a0de20cc7efea1c4 Mon Sep 17 00:00:00 2001 From: Griffith Date: Tue, 16 Jan 2024 08:54:00 +0000 Subject: [PATCH 26/26] 654 add underscore to imputation class creation --- src/imputation/sf_expansion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imputation/sf_expansion.py b/src/imputation/sf_expansion.py index 82b534519..bfbe9efde 100644 --- a/src/imputation/sf_expansion.py +++ b/src/imputation/sf_expansion.py @@ -178,7 +178,7 @@ def prepare_short_form_constructed(df: pd.DataFrame, master_cols: List) -> pd.Da # Create imputation class for the short df.loc[sf_constructed_mask, "imp_class"] = ( - df.loc[sf_constructed_mask, "200"] + df.loc[sf_constructed_mask, "201"] + df.loc[sf_constructed_mask, "200"] + "_" + df.loc[sf_constructed_mask, "201"] ) # Copy the values of the master columns to the corresponding "_imputed" column
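Taken together, the later patches lend themselves to a few standalone sketches. First, the many-to-one mapper check reworked in patch 23: the sketch below is an illustrative re-implementation on a toy mapper, not the validation.py code, which also logs the offending codes before raising.

import pandas as pd


def check_many_to_one(mapper: pd.DataFrame, col_many: str, col_one: str) -> pd.DataFrame:
    # deduplicate the two mapper columns, then require exactly one target value per source value
    deduped = mapper[[col_many, col_one]].drop_duplicates()
    counts = deduped.groupby(col_many)[col_one].nunique()
    if (counts > 1).any():
        bad_codes = counts[counts > 1].index.tolist()
        raise ValueError(f"Mapper is many to many for codes: {bad_codes}")
    return deduped


# a well-formed numeric-to-alpha product group mapper passes the check
pg_num_alpha = pd.DataFrame({"pg_numeric": [1, 2, 2], "pg_alpha": ["A", "B", "B"]})
print(check_many_to_one(pg_num_alpha, "pg_numeric", "pg_alpha"))

# a mapper sending code 1 to both "A" and "B" fails it
bad_mapper = pd.DataFrame({"pg_numeric": [1, 1], "pg_alpha": ["A", "B"]})
try:
    check_many_to_one(bad_mapper, "pg_numeric", "pg_alpha")
except ValueError as err:
    print(err)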
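Next, the null handling added to carry forwards in patches 24 and 25. The toy frames and the simplified merge below are assumptions for illustration only; the pipeline logic lives in carry_forwards in src/imputation/MoR.py.

import pandas as pd

current = pd.DataFrame({"reference": [1, 2], "instance": [0, 0], "211": [None, None]})
backdata = pd.DataFrame({"reference": [1, 2], "instance": [1, None], "211": [100.0, None]})

df = pd.merge(
    current, backdata, how="left", on="reference", suffixes=("", "_prev"), indicator=True
)

# the merge can upcast the instance columns, so cast back to nullable Int64 (patch 25)
df = df.astype({"instance": "Int64", "instance_prev": "Int64"})

match_cond = df["_merge"] == "both"

# carry previous values into the imputed column, filling nulls with zero (patch 24)
df.loc[match_cond, "211_imputed"] = df.loc[match_cond, "211_prev"].fillna(0)
df.loc[match_cond, "imp_marker"] = "CF"

print(df[["reference", "instance", "211_imputed", "imp_marker"]])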
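Finally, the short-form constructed preparation added in patch 25, with the underscore separator from patch 26. Column names follow the patches; the toy frame is illustrative only, and the real function is prepare_short_form_constructed in src/imputation/sf_expansion.py.

import pandas as pd

master_cols = ["211"]
df = pd.DataFrame(
    {
        "formtype": ["0006", "0006", "0001"],
        "imp_marker": ["constructed", "TMI", "R"],
        "200": ["C", "D", "C"],
        "201": ["AA", "AB", "AA"],
        "211": [50.0, 60.0, 70.0],
        "211_imputed": [None, 60.0, None],
    }
)

sf_constructed_mask = (df.formtype == "0006") & (df.imp_marker == "constructed")

# build the imputation class with an underscore separator, e.g. "C" + "AA" -> "C_AA" (patch 26)
df.loc[sf_constructed_mask, "imp_class"] = (
    df.loc[sf_constructed_mask, "200"] + "_" + df.loc[sf_constructed_mask, "201"]
)

# constructed rows skipped imputation, so copy the master values into the imputed columns
for col in master_cols:
    df.loc[sf_constructed_mask, f"{col}_imputed"] = df.loc[sf_constructed_mask, col]

print(df[["formtype", "imp_marker", "imp_class", "211_imputed"]])

The underscore presumably keeps the class label unambiguous and consistent with the imputation classes built elsewhere in the pipeline.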