From c9bf7abe6a9eb0d367dd95be6cc9ce41e81f10df Mon Sep 17 00:00:00 2001
From: Tom Coates
Date: Fri, 5 Jan 2024 15:40:27 +0000
Subject: [PATCH 01/26] fix inconsistency in naming of short/long form

---
 ...{frozen_longform_schema.toml => long_form_schema.toml} | 0
 ...rozen_shortform_schema.toml => short_form_schema.toml} | 0
 src/developer_config.yaml                                 | 8 ++++----
 src/outputs/long_form.py                                  | 2 +-
 src/outputs/short_form.py                                 | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)
 rename config/output_schemas/{frozen_longform_schema.toml => long_form_schema.toml} (100%)
 rename config/output_schemas/{frozen_shortform_schema.toml => short_form_schema.toml} (100%)

diff --git a/config/output_schemas/frozen_longform_schema.toml b/config/output_schemas/long_form_schema.toml
similarity index 100%
rename from config/output_schemas/frozen_longform_schema.toml
rename to config/output_schemas/long_form_schema.toml
diff --git a/config/output_schemas/frozen_shortform_schema.toml b/config/output_schemas/short_form_schema.toml
similarity index 100%
rename from config/output_schemas/frozen_shortform_schema.toml
rename to config/output_schemas/short_form_schema.toml
diff --git a/src/developer_config.yaml b/src/developer_config.yaml
index 843c341d7..2c9677731 100644
--- a/src/developer_config.yaml
+++ b/src/developer_config.yaml
@@ -67,8 +67,8 @@ hdfs_paths:
   manual_imp_trim_path: "/ons/rdbe_dev/imputation/manual_trimming"
   outliers_path: "/ons/rdbe_dev/outliers"
   estimation_path: "/ons/rdbe_dev/estimation"
-  short_form_schema: "src/outputs/output_schemas/frozen_shortform_schema.toml"
-  long_form_schema: "src/outputs/output_schemas/frozen_longform_schema.toml"
+  short_form_schema: "src/outputs/output_schemas/short_form_schema.toml"
+  long_form_schema: "src/outputs/output_schemas/long_form_schema.toml"
   export_path: /ons/rdbe_dev/outgoing_export
   feather_path: "/ons/rdbe_dev/staging/feather"
 network_paths:
@@ -113,8 +113,8 @@ network_paths:
   civil_defence_detailed_path: "R:/BERD Results System Development 2023/DAP_emulation/mappers/civil_defence_detailed.csv"
   sic_division_detailed_path: "R:/BERD Results System Development 2023/DAP_emulation/mappers/sic_div_detailed.csv"
 schema_paths:
-  frozen_shortform_schema: "config/output_schemas/frozen_shortform_schema.toml"
-  frozen_longform_schema: "config/output_schemas/frozen_longform_schema.toml"
+  short_form_schema: "config/output_schemas/short_form_schema.toml"
+  long_form_schema: "config/output_schemas/long_form_schema.toml"
   tau_schema: "config/output_schemas/tau_schema.toml"
   gb_sas_schema: "config/output_schemas/gb_sas_schema.toml"
   ni_sas_schema: "config/output_schemas/ni_sas_schema.toml"
diff --git a/src/outputs/long_form.py b/src/outputs/long_form.py
index bdc75bdfc..250e0ca2c 100644
--- a/src/outputs/long_form.py
+++ b/src/outputs/long_form.py
@@ -49,7 +49,7 @@ def output_long_form(
     df = map_o.join_fgn_ownership(df, ultfoc_mapper)

     # Create long form output dataframe with required columns from schema
-    schema_path = config["schema_paths"]["frozen_longform_schema"]
+    schema_path = config["schema_paths"]["long_form_schema"]
     schema_dict = load_schema(schema_path)
     longform_output = create_output_df(df, schema_dict)

diff --git a/src/outputs/short_form.py b/src/outputs/short_form.py
index eaf5516ef..57ad5c667 100644
--- a/src/outputs/short_form.py
+++ b/src/outputs/short_form.py
@@ -136,7 +136,7 @@ def output_short_form(
     df = run_shortform_prep(df, round_val=4)

     # Create short form output dataframe with required columns from schema
-    schema_path = config["schema_paths"]["frozen_shortform_schema"]
config["schema_paths"]["frozen_shortform_schema"] + schema_path = config["schema_paths"]["short_form_schema"] schema_dict = load_schema(schema_path) shortform_output = create_output_df(df, schema_dict) From 93a1d89710c33a055f8aec56b68f1dc79f949f90 Mon Sep 17 00:00:00 2001 From: Griffith Date: Fri, 5 Jan 2024 17:46:05 +0000 Subject: [PATCH 02/26] script to create C:/Users/griffa1/Anaconda3/envs/resdev362/python.exe d:/coding_projects/research-and-development/unit_test_helper.py --- unit_test_helper.py | 60 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 unit_test_helper.py diff --git a/unit_test_helper.py b/unit_test_helper.py new file mode 100644 index 000000000..5e481a4bc --- /dev/null +++ b/unit_test_helper.py @@ -0,0 +1,60 @@ +"""Read in a csv file and ouput a test file with data for a unit test.""" +import pandas as pd +import os + +# configuration settings +csv_path = "D:/coding_projects/randd_test_data/" +input_file = "outlier_test1.csv" + +# whether the unit test data is input or expected output +in_or_output = "input" + +output_filename = f"new_{in_or_output}_function" + +# read in the csv +path1 = os.path.join(csv_path, input_file) +df1 = pd.read_csv(path1) + + +# set all datatypes to string - we are outputting all the data as a string +df1 = df1.astype(str) + +# add quotes to the strings in the columns that should show as string types +string_cols = ["period"] + +df1[string_cols] = df1[string_cols].applymap('"{}"'.format) + +# prepare the output formatting +tab = " "*4 + +col_list = df1.columns +col_string = "" + +# create a new column that joins the contents of the other columns +df1['output'] = f"{tab}[" +for col in df1.columns[:-1]: + df1["output"] += df1[col] + ", " + col_string += f'{tab}{tab}"{col}",\n' + +df1['output'] += df1[df1.columns[-2]] + "]," + +# concatenate everything in the new column into a single string +rows_string = df1["output"].str.cat(sep=f"\n{tab}") + +# join all the components into a final string for output +full_text = f'''def create_input_df(self): + """Create an input dataframe for the test.""" + {in_or_output}_columns = [\n{col_string}{tab}] + + data = [\n{tab}{rows_string}] + + {in_or_output}_df = pandasDF(data=data, columns={in_or_output}_columns) + return {in_or_output}_df + ''' + +# write the prepared text to a txt file +out_path = os.path.join(csv_path, output_filename + ".txt") + +text_file = open(out_path, "w") +text_file.write(full_text) +text_file.close() From b72d7ec8fde0c67c554528260435bb1f518338ff Mon Sep 17 00:00:00 2001 From: Griffith Date: Fri, 5 Jan 2024 17:48:34 +0000 Subject: [PATCH 03/26] script to create unit test dataframe from csv --- unit_test_helper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unit_test_helper.py b/unit_test_helper.py index 5e481a4bc..fccfc59c7 100644 --- a/unit_test_helper.py +++ b/unit_test_helper.py @@ -9,7 +9,7 @@ # whether the unit test data is input or expected output in_or_output = "input" -output_filename = f"new_{in_or_output}_function" +output_filename = f"{in_or_output}_function" # read in the csv path1 = os.path.join(csv_path, input_file) @@ -46,7 +46,7 @@ """Create an input dataframe for the test.""" {in_or_output}_columns = [\n{col_string}{tab}] - data = [\n{tab}{rows_string}] + data = [\n{tab}{rows_string}\n{tab}] {in_or_output}_df = pandasDF(data=data, columns={in_or_output}_columns) return {in_or_output}_df From 43f2841c85173f180516a807c30f27b26e6beb9b Mon Sep 17 00:00:00 2001 From: George Zorinyants Date: Tue, 9 Jan 
2024 10:05:09 +0000 Subject: [PATCH 04/26] Changed postcode column from postcodes_harmonised to 601 --- src/site_apportionment/site_apportionment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/site_apportionment/site_apportionment.py b/src/site_apportionment/site_apportionment.py index 2b5bec401..8fff9f701 100644 --- a/src/site_apportionment/site_apportionment.py +++ b/src/site_apportionment/site_apportionment.py @@ -12,7 +12,7 @@ ins = "instance" period = "period" form = "formtype" -postcode = "postcodes_harmonised" +postcode = "601" # "postcodes_harmonised" percent = "602" product = "201" pg_num = "pg_numeric" From 6b57fba3453f279bd96318ad939c6df8e02a04d3 Mon Sep 17 00:00:00 2001 From: Griffith Date: Tue, 9 Jan 2024 10:15:37 +0000 Subject: [PATCH 05/26] remove duplicate output_imputation --- src/developer_config.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/src/developer_config.yaml b/src/developer_config.yaml index e07316c20..6432ea285 100644 --- a/src/developer_config.yaml +++ b/src/developer_config.yaml @@ -27,7 +27,6 @@ global: output_auto_outliers: False output_outlier_qa : False output_estimation_qa: False - output_imputation_qa: False output_apportionment_qa: False output_long_form: False output_short_form: False From 8dc14abbfdc4859865591483a4489edd0c5a6093 Mon Sep 17 00:00:00 2001 From: George Zorinyants Date: Tue, 9 Jan 2024 14:25:18 +0000 Subject: [PATCH 06/26] Postcode topup is applied to 601 in validation --- src/staging/validation.py | 78 +++++++-------------------------------- 1 file changed, 14 insertions(+), 64 deletions(-) diff --git a/src/staging/validation.py b/src/staging/validation.py index 8b257cc69..417a28116 100644 --- a/src/staging/validation.py +++ b/src/staging/validation.py @@ -185,6 +185,7 @@ def validate_post_col( ) df["postcodes_harmonised"] = df["postcodes_harmonised"].apply(postcode_topup) + df["601"] = df["601"].apply(postcode_topup) ValidationLogger.info("All postcodes validated....") @@ -333,9 +334,6 @@ def load_schema(file_path: str = "./config/contributors_schema.toml") -> dict: toml_dict = toml.load(file_path) else: # Return False if file does not exist - ValidationLogger.warning( - "Validation schema does not exist! Path may be incorrect" - ) return file_exists return toml_dict @@ -418,9 +416,6 @@ def validate_data_with_schema(survey_df: pd.DataFrame, schema_path: str): # Load schema from toml dtypes_schema = load_schema(schema_path) - if not dtypes_schema: - raise FileNotFoundError(f"File at {schema_path} does not exist. Check path") - # Create a dict for dtypes only dtypes_dict = { column_nm: dtypes_schema[column_nm]["Deduced_Data_Type"] @@ -447,15 +442,6 @@ def validate_data_with_schema(survey_df: pd.DataFrame, schema_path: str): survey_df[column] = survey_df[column].astype(pd.Int64Dtype()) elif dtypes_dict[column] == "str": survey_df[column] = survey_df[column].astype("string") - elif "datetime" in dtypes_dict[column]: - try: - survey_df[column] = pd.to_datetime( - survey_df[column], errors="coerce" - ) - except TypeError: - raise TypeError( - f"Failed to convert column '{column}' to datetime. Please check the data." 
- ) else: survey_df[column] = survey_df[column].astype(dtypes_dict[column]) ValidationLogger.debug(f"{column} after: {survey_df[column].dtype}") @@ -565,31 +551,22 @@ def check_ultfoc(value): @time_logger_wrap @exception_wrap -def validate_many_to_one(*args) -> pd.DataFrame: +def validate_many_to_one( + mapper: pd.DataFrame, col_many: str, col_one: str +) -> pd.DataFrame: """ - Validates a many-to-one mapper DataFrame. - This function performs the following checks: - 1. Checks if the mapper has two specified columns, referred to as 'col_many' and 'col_one'. - 2. Selects and deduplicates 'col_many' and 'col_one'. - 3. Checks that for each entry in 'col_many' there is exactly one corresponding entry in 'col_one'. + Validates a many to one mapper: + 1. Checks if the mapper has two columns col_many and col_one. + 2. Salects and deduplicates col_many and col_one. + 3. Checks that for each entry in col_many there is exactly one entry in + col_one. Args: - *args: Variable length argument list. It should contain the following items in order: - - df (pd.DataFrame): The input mapper DataFrame. - - col_many (str): The name of the column with many entries. - - col_one (str): The name of the column with one entry. - - Returns: - pd.DataFrame: The validated mapper DataFrame with deduplicated 'col_many' and 'col_one' columns. - - Raises: - ValueError: If the mapper does not have the 'col_many' and 'col_one' columns, or if there are multiple entries in 'col_one' for any entry in 'col_many'. + df (pd.DataFrame): The input mapper + col_many (str): name of the column with many entries + col_one (str): name of the column with one entry """ - - mapper = args[0] - col_many = args[1] - col_one = args[2] try: # Check that expected column are present cols = mapper.columns @@ -611,7 +588,7 @@ def validate_many_to_one(*args) -> pd.DataFrame: ValidationLogger.info( "The following codes have multile mapping: \n {df_bad}" ) - raise ValueError("Mapper is many to many") + raise ValueError(f"Mapper is many to many") return df except ValueError as ve: @@ -648,7 +625,7 @@ def validate_cora_df(df: pd.DataFrame) -> pd.DataFrame: df["contents_check"] = status_check & from_status_check # Check if there are any False values in the "contents_check" column - if (df["contents_check"] == False).any(): # noqa + if (df["contents_check"] == False).any(): raise ValueError("Unexpected format within column contents") # Drop the "contents_check" column @@ -658,30 +635,3 @@ def validate_cora_df(df: pd.DataFrame) -> pd.DataFrame: except ValueError as ve: raise ValueError("cora status mapper validation failed: " + str(ve)) - - -def flag_no_rand_spenders(df, raise_or_warn): - """ - Flags any records that answer "No" to "604" and also report their expenditure in "211" as more than 0. - - Parameters: - df (pandas.DataFrame): The input DataFrame. 
- - Returns: - None - """ - invalid_records = df.loc[(df["604"] == "No") & (df["211"] > 0)] - - if not invalid_records.empty: - if raise_or_warn == "raise": - raise Exception("Some records report no R&D, but spend in 211 > 0.") - elif raise_or_warn == "warn": - total_invalid_spend = invalid_records["211"].sum() - ValidationLogger.error("Some records report no R&D, but spend in 211 > 0.") - ValidationLogger.error( - f"The total spend of 'No' R&D companies is £{int(total_invalid_spend)}" - ) - ValidationLogger.error(invalid_records) - - else: - ValidationLogger.debug("All records have valid R&D spend.") From c215a842b259c763ea86ed18e265dba52587c316 Mon Sep 17 00:00:00 2001 From: Griffith Date: Thu, 11 Jan 2024 10:13:06 +0000 Subject: [PATCH 07/26] move the removal of filter qa to imputation --- src/developer_config.yaml | 1 - src/imputation/imputation_helpers.py | 54 ++++++++++++++++++++++++-- src/imputation/imputation_main.py | 19 ++++----- src/outlier_detection/auto_outliers.py | 8 ---- src/outputs/form_output_prep.py | 30 ++------------ src/outputs/outputs_main.py | 15 +------ 6 files changed, 66 insertions(+), 61 deletions(-) diff --git a/src/developer_config.yaml b/src/developer_config.yaml index e07316c20..6432ea285 100644 --- a/src/developer_config.yaml +++ b/src/developer_config.yaml @@ -27,7 +27,6 @@ global: output_auto_outliers: False output_outlier_qa : False output_estimation_qa: False - output_imputation_qa: False output_apportionment_qa: False output_long_form: False output_short_form: False diff --git a/src/imputation/imputation_helpers.py b/src/imputation/imputation_helpers.py index db4707e48..f12dffdc5 100644 --- a/src/imputation/imputation_helpers.py +++ b/src/imputation/imputation_helpers.py @@ -1,11 +1,11 @@ """Utility functions to be used in the imputation module.""" import logging - -from typing import List import pandas as pd - +from typing import List, Dict, Callable from itertools import chain +from src.outputs.status_filtered import output_status_filtered + ImputationHelpersLogger = logging.getLogger(__name__) @@ -219,3 +219,51 @@ def fill_sf_zeros(df: pd.DataFrame) -> pd.DataFrame: df.loc[(sf_mask & clear_mask), q] = df.copy()[q].fillna(0) return df + + +def tidy_imputation_dataframe( + df: pd.DataFrame, + config: Dict, + logger, + to_impute_cols: List, + write_csv: Callable, + run_id: int, + ) -> pd.DataFrame: + """Remove rows and columns not needed after imputation.""" + # Create lists for the qa cols + imp_cols = [f"{col}_imputed" for col in to_impute_cols] + + # Update the original breakdown questions and target variables with the imputed + df[to_impute_cols] = df[imp_cols] + + # Remove all qa columns + to_drop = [ + col + for col in df.columns + if (col.endswith("prev") | col.endswith("imputed") | col.endswith("link")) + ] + df = df.drop(columns=to_drop) + + # Keep only clear and imputed records + imputed_statuses = ["TMI", "CF", "MoR", "constructed"] + to_keep = df["imp_marker"].isin(imputed_statuses) | (df["imp_marker"] == "R") + + to_keep_df = df.copy().loc[to_keep] + filtered_output_df = df.copy().loc[~to_keep] + + # change the value of the status column to 'imputed' for imputed statuses + condition = to_keep_df["status"].isin(imputed_statuses) + to_keep_df.loc[condition, "status"] = "imputed" + + # Running status filtered full dataframe output for QA + if config["global"]["output_status_filtered"]: + logger.info("Starting status filtered output...") + output_status_filtered( + filtered_output_df, + config, + write_csv, + run_id, + ) + 
logger.info("Finished status filtered output.") + + return to_keep_df diff --git a/src/imputation/imputation_main.py b/src/imputation/imputation_main.py index a023f982c..a08c6365c 100644 --- a/src/imputation/imputation_main.py +++ b/src/imputation/imputation_main.py @@ -141,13 +141,14 @@ def run_imputation( ImputationMainLogger.info("Finished Imputation calculation.") - # Create names for imputed cols - imp_cols = [f"{col}_imputed" for col in to_impute_cols] - - # Update the original breakdown questions and target variables with the imputed - imputed_df[to_impute_cols] = imputed_df[imp_cols] - - # Drop imputed values from df - imputed_df = imputed_df.drop(columns=imp_cols) - + # remove rows and columns no longer needed from the imputed dataframe + imputed_df = hlp.tidy_imputation_dataframe( + imputed_df, + config, + ImputationMainLogger, + to_impute_cols, + write_csv, + run_id, + ) + return imputed_df diff --git a/src/outlier_detection/auto_outliers.py b/src/outlier_detection/auto_outliers.py index 26eae3583..b3eaa3918 100644 --- a/src/outlier_detection/auto_outliers.py +++ b/src/outlier_detection/auto_outliers.py @@ -238,14 +238,6 @@ def run_auto_flagging( # loop through all columns to be flagged for outliers for value_col in flag_value_cols: - # to_numeric is needed to convert strings. However 'coerce' - # means values that - # can't be converted are represented by NaNs. - # TODO data validation and cleaning should replace the need for - # 'to_numeric' - # check ticket (RDRP-386) - df[value_col] = pd.to_numeric(df[value_col], errors="coerce") - # Call function to add a flag for auto outliers in column value_col df = flag_outliers(df, upper_clip, lower_clip, value_col) diff --git a/src/outputs/form_output_prep.py b/src/outputs/form_output_prep.py index 0e3898484..6ef352657 100644 --- a/src/outputs/form_output_prep.py +++ b/src/outputs/form_output_prep.py @@ -28,26 +28,13 @@ def form_output_prep( filtered_output_df (pd.DataFrame): data noot used in outputs """ - - imputed_statuses = ["TMI", "CF", "MoR", "constructed"] - - to_keep = estimated_df["imp_marker"].isin(imputed_statuses) | ( - estimated_df["imp_marker"] == "R" - ) - # Deal with "No" in 604, also eliminating spenders flag_no_rand_spenders(estimated_df, "error") no_rnd_spenders_filter = ~( (estimated_df["604"] == "No") & (estimated_df["211"] > 0) ) - estimated_df = estimated_df.copy().loc[no_rnd_spenders_filter] - - # filter estimated_df and weighted_df to only include clear or imputed statuses - outputs_df = estimated_df.copy().loc[to_keep] - tau_outputs_df = weighted_df.copy().loc[to_keep] - - # filter estimated_df for records not included in outputs - filtered_output_df = estimated_df.copy().loc[~to_keep] + outputs_df = estimated_df.copy().loc[no_rnd_spenders_filter] + tau_outputs_df = weighted_df.copy().loc[no_rnd_spenders_filter] if ni_full_responses is not None: # Add required columns to NI data @@ -66,19 +53,10 @@ def form_output_prep( # outputs_df = pd.concat([outputs_df, ni_full_responses]) tau_outputs_df = pd.concat([tau_outputs_df, ni_full_responses]) - # change the value of the status column to 'imputed' for imputed statuses - condition = outputs_df["status"].isin(imputed_statuses) - outputs_df.loc[condition, "status"] = "imputed" - - return ni_full_responses, outputs_df, tau_outputs_df, filtered_output_df + return ni_full_responses, outputs_df, tau_outputs_df else: - - # change the value of the status column to 'imputed' for imputed statuses - condition = outputs_df["status"].isin(imputed_statuses) - 
outputs_df.loc[condition, "status"] = "imputed" - # create an empty ni_responses dataframe ni_full_responses = pd.DataFrame() - return ni_full_responses, outputs_df, tau_outputs_df, filtered_output_df + return ni_full_responses, outputs_df, tau_outputs_df diff --git a/src/outputs/outputs_main.py b/src/outputs/outputs_main.py index c61280772..dc93b367d 100644 --- a/src/outputs/outputs_main.py +++ b/src/outputs/outputs_main.py @@ -4,7 +4,6 @@ from typing import Callable, Dict, Any from src.outputs.form_output_prep import form_output_prep -from src.outputs.status_filtered import output_status_filtered from src.outputs.short_form import output_short_form from src.outputs.long_form import output_long_form from src.outputs.tau import output_tau @@ -64,8 +63,7 @@ def run_outputs( ( ni_full_responses, outputs_df, - tau_outputs_df, - filtered_output_df, + tau_outputs_df ) = form_output_prep( estimated_df, weighted_df, @@ -74,17 +72,6 @@ def run_outputs( sic_pg_alpha, ) - # Running status filtered full dataframe output for QA - if config["global"]["output_status_filtered"]: - OutputMainLogger.info("Starting status filtered output...") - output_status_filtered( - filtered_output_df, - config, - write_csv, - run_id, - ) - OutputMainLogger.info("Finished status filtered output.") - # Running short form output if config["global"]["output_short_form"]: OutputMainLogger.info("Starting short form output...") From 2a42dcb0922eeb2a6cde4c2c08990821d642c309 Mon Sep 17 00:00:00 2001 From: Griffith Date: Thu, 11 Jan 2024 10:36:00 +0000 Subject: [PATCH 08/26] updated the filtered_qa_schema toml --- .../status_filtered_qa_schema.toml | 280 ------------------ src/imputation/imputation_helpers.py | 4 + 2 files changed, 4 insertions(+), 280 deletions(-) diff --git a/config/output_schemas/status_filtered_qa_schema.toml b/config/output_schemas/status_filtered_qa_schema.toml index 148a8cb81..2e520fe95 100644 --- a/config/output_schemas/status_filtered_qa_schema.toml +++ b/config/output_schemas/status_filtered_qa_schema.toml @@ -646,286 +646,6 @@ Deduced_Data_Type = "object" old_name = "imp_class" Deduced_Data_Type = "object" -[202_prev] -old_name = "202_prev" -Deduced_Data_Type = "float64" - -[203_prev] -old_name = "203_prev" -Deduced_Data_Type = "float64" - -[204_prev] -old_name = "204_prev" -Deduced_Data_Type = "float64" - -[205_prev] -old_name = "205_prev" -Deduced_Data_Type = "float64" - -[206_prev] -old_name = "206_prev" -Deduced_Data_Type = "float64" - -[207_prev] -old_name = "207_prev" -Deduced_Data_Type = "float64" - -[209_prev] -old_name = "209_prev" -Deduced_Data_Type = "float64" - -[210_prev] -old_name = "210_prev" -Deduced_Data_Type = "float64" - -[211_prev] -old_name = "211_prev" -Deduced_Data_Type = "float64" - -[212_prev] -old_name = "212_prev" -Deduced_Data_Type = "float64" - -[214_prev] -old_name = "214_prev" -Deduced_Data_Type = "float64" - -[216_prev] -old_name = "216_prev" -Deduced_Data_Type = "float64" - -[218_prev] -old_name = "218_prev" -Deduced_Data_Type = "float64" - -[219_prev] -old_name = "219_prev" -Deduced_Data_Type = "float64" - -[220_prev] -old_name = "220_prev" -Deduced_Data_Type = "float64" - -[221_prev] -old_name = "221_prev" -Deduced_Data_Type = "float64" - -[222_prev] -old_name = "222_prev" -Deduced_Data_Type = "float64" - -[223_prev] -old_name = "223_prev" -Deduced_Data_Type = "float64" - -[225_prev] -old_name = "225_prev" -Deduced_Data_Type = "float64" - -[226_prev] -old_name = "226_prev" -Deduced_Data_Type = "float64" - -[227_prev] -old_name = "227_prev" -Deduced_Data_Type = 
"float64" - -[228_prev] -old_name = "228_prev" -Deduced_Data_Type = "float64" - -[229_prev] -old_name = "229_prev" -Deduced_Data_Type = "float64" - -[237_prev] -old_name = "237_prev" -Deduced_Data_Type = "float64" - -[242_prev] -old_name = "242_prev" -Deduced_Data_Type = "float64" - -[243_prev] -old_name = "243_prev" -Deduced_Data_Type = "float64" - -[244_prev] -old_name = "244_prev" -Deduced_Data_Type = "float64" - -[245_prev] -old_name = "245_prev" -Deduced_Data_Type = "float64" - -[246_prev] -old_name = "246_prev" -Deduced_Data_Type = "float64" - -[247_prev] -old_name = "247_prev" -Deduced_Data_Type = "float64" - -[248_prev] -old_name = "248_prev" -Deduced_Data_Type = "float64" - -[249_prev] -old_name = "249_prev" -Deduced_Data_Type = "float64" - -[250_prev] -old_name = "250_prev" -Deduced_Data_Type = "float64" - -[302_prev] -old_name = "302_prev" -Deduced_Data_Type = "float64" - -[303_prev] -old_name = "303_prev" -Deduced_Data_Type = "float64" - -[304_prev] -old_name = "304_prev" -Deduced_Data_Type = "float64" - -[305_prev] -old_name = "305_prev" -Deduced_Data_Type = "float64" - -[emp_researcher_prev] -old_name = "emp_researcher_prev" -Deduced_Data_Type = "float64" - -[emp_technician_prev] -old_name = "emp_technician_prev" -Deduced_Data_Type = "float64" - -[emp_other_prev] -old_name = "emp_other_prev" -Deduced_Data_Type = "float64" - -[emp_total_prev] -old_name = "emp_total_prev" -Deduced_Data_Type = "float64" - -[headcount_res_m_prev] -old_name = "headcount_res_m_prev" -Deduced_Data_Type = "float64" - -[headcount_res_f_prev] -old_name = "headcount_res_f_prev" -Deduced_Data_Type = "float64" - -[headcount_tec_m_prev] -old_name = "headcount_tec_m_prev" -Deduced_Data_Type = "float64" - -[headcount_tec_f_prev] -old_name = "headcount_tec_f_prev" -Deduced_Data_Type = "float64" - -[headcount_oth_m_prev] -old_name = "headcount_oth_m_prev" -Deduced_Data_Type = "float64" - -[headcount_oth_f_prev] -old_name = "headcount_oth_f_prev" -Deduced_Data_Type = "float64" - -[headcount_tot_m_prev] -old_name = "headcount_tot_m_prev" -Deduced_Data_Type = "float64" - -[headcount_tot_f_prev] -old_name = "headcount_tot_f_prev" -Deduced_Data_Type = "float64" - -[headcount_total_prev] -old_name = "headcount_total_prev" -Deduced_Data_Type = "float64" - -[211_link] -old_name = "211_link" -Deduced_Data_Type = "float64" - -[305_link] -old_name = "305_link" -Deduced_Data_Type = "float64" - -[emp_researcher_link] -old_name = "emp_researcher_link" -Deduced_Data_Type = "float64" - -[emp_technician_link] -old_name = "emp_technician_link" -Deduced_Data_Type = "float64" - -[emp_other_link] -old_name = "emp_other_link" -Deduced_Data_Type = "float64" - -[headcount_res_m_link] -old_name = "headcount_res_m_link" -Deduced_Data_Type = "float64" - -[headcount_res_f_link] -old_name = "headcount_res_f_link" -Deduced_Data_Type = "float64" - -[headcount_tec_m_link] -old_name = "headcount_tec_m_link" -Deduced_Data_Type = "float64" - -[headcount_tec_f_link] -old_name = "headcount_tec_f_link" -Deduced_Data_Type = "float64" - -[headcount_oth_m_link] -old_name = "headcount_oth_m_link" -Deduced_Data_Type = "float64" - -[headcount_oth_f_link] -old_name = "headcount_oth_f_link" -Deduced_Data_Type = "float64" - -[200_original] -old_name = "200_original" -Deduced_Data_Type = "float64" - -[pg_sic_class] -old_name = "pg_sic_class" -Deduced_Data_Type = "object" - -[empty_pgsic_group] -old_name = "empty_pgsic_group" -Deduced_Data_Type = "object" - -[empty_pg_group] -old_name = "empty_pg_group" -Deduced_Data_Type = "object" - -[200_imp_marker] 
-old_name = "200_imp_marker" -Deduced_Data_Type = "object" - -[211_trim] -old_name = "211_trim" -Deduced_Data_Type = "object" - -[305_trim] -old_name = "305_trim" -Deduced_Data_Type = "object" - -[manual_trim] -old_name = "manual_trim" -Deduced_Data_Type = "object" - -[sf_expansion_grouping] -old_name = "sf_expansion_grouping" -Deduced_Data_Type = "object" - [auto_outlier] old_name = "auto_outlier" Deduced_Data_Type = "bool" diff --git a/src/imputation/imputation_helpers.py b/src/imputation/imputation_helpers.py index f12dffdc5..f91f646b5 100644 --- a/src/imputation/imputation_helpers.py +++ b/src/imputation/imputation_helpers.py @@ -242,6 +242,10 @@ def tidy_imputation_dataframe( for col in df.columns if (col.endswith("prev") | col.endswith("imputed") | col.endswith("link")) ] + + to_drop += ["200_original", "pg_sic_class", "empty_pgsic_group", "empty_pg_group"] + to_drop += ["200_imp_marker", "211_trim", "305_trim", "manual_trim"] + to_drop += ["sf_expansion_grouping"] df = df.drop(columns=to_drop) # Keep only clear and imputed records From 638e5c153876cbfc65fcde55f8d15c4e970e2704 Mon Sep 17 00:00:00 2001 From: Griffith Date: Thu, 11 Jan 2024 11:02:18 +0000 Subject: [PATCH 09/26] correct the cols to be dropped at the end of imputation --- src/imputation/imputation.py | 581 ----------- src/imputation/imputation_helpers.py | 1 - tests/test_imputation/test_imputation.py | 1173 ---------------------- 3 files changed, 1755 deletions(-) delete mode 100644 src/imputation/imputation.py delete mode 100644 tests/test_imputation/test_imputation.py diff --git a/src/imputation/imputation.py b/src/imputation/imputation.py deleted file mode 100644 index a8c2c8c60..000000000 --- a/src/imputation/imputation.py +++ /dev/null @@ -1,581 +0,0 @@ -import pandas as pd -import numpy as np -import logging - -# TODO almost each could be further generalised in terms of -# variable and function names - -ImputationLogger = logging.getLogger(__name__) - - -def filter_by_column_content( - raw_df: pd.DataFrame, column: str, column_content: str -) -> pd.DataFrame: - """Filter a column for specific string content. - - Args: - raw_df (pd.DataFrame): The dataframe to be filtered. - column (str): The name of the column to be filtered. - column_content (str): The content to be filtered on. - - Returns: - pd.DataFrame: The filtered dataframe. - """ - # filter for rows with column_content - clean_df = raw_df[raw_df[column] == column_content].copy() - - return clean_df - - -def rename_imp_col(clean_df: pd.DataFrame): - """ - This function renames columns in dataframe, replacing civ_or_def with 200 - and Product_group with 201 if they are present. - - Args: - clean_df (pd.DataFrame): Input Dataframe to rename columns. - - Returns: - pd.Dataframe: returns dataframe with renamed columns. 
- """ - if "civ_or_def" in clean_df.columns: - clean_df = clean_df.rename(columns={"civ_or_def": "200"}) - - if "Product_group" in clean_df.columns: - clean_df = clean_df.rename(columns={"Product_group": "201"}) - - return clean_df - - -def create_imp_class_col( - clean_df: pd.DataFrame, col_first_half: str, col_second_half: str, class_name: str -) -> pd.DataFrame: - """_summary_ - - Args: - clean_df (_type_): _description_ - - Returns: - _type_: _description_ - """ - - # TODO remove when using real data - clean_df[f"{col_second_half}"] = clean_df[f"{col_second_half}"].astype(str) - - # Create class col with concatenation - clean_df[f"{class_name}"] = ( - clean_df[f"{col_first_half}"] + "_" + clean_df[f"{col_second_half}"] - ) - - return clean_df - - -def filter_same_class( - clean_df: pd.DataFrame, current_period: str, previous_period: str -) -> pd.DataFrame: - """_summary_ - Args: - clean_df (_type_): _description_ - - Returns: - _type_: _description_ - """ - - # Filter for cols with same contents - clean_same_class_df = clean_df[ - clean_df[f"{current_period}_class"] == clean_df[f"{previous_period}_class"] - ].copy() - - return clean_same_class_df - - -def filter_pairs( - clean_same_class_df: pd.DataFrame, - target_variable: str, - current_period: str, - previous_period: str, -) -> pd.DataFrame: - """_summary_ Checks two columns have same contents - - Args: - clean_same_class_df (_type_): _description_ - - Returns: - _type_: _description_ - """ - # TODO needs more tweeks but essentially same as - # filter_same_class but for target var not class - matched_pairs_df = clean_same_class_df[ - (clean_same_class_df[f"{current_period}_{target_variable}_status"] == "Present") - & ( - clean_same_class_df[f"{previous_period}_{target_variable}_status"] - == "Present" - ) - ].copy() - - return matched_pairs_df - - -def flag_nulls_and_zeros( - target_variables_list: list, - df: pd.DataFrame, - curr_q: str, - prev_q: str, -): - """Flag target variables containing nulls or zreos. - - A new column {var}_valid is created for each var in the target variables. - This is flagged with 1 if either the current period or previous period - contains either a null or a zero. Otherwise, the flag is 0. - - Args: - target_variables (list of str): the target variables - df (pd.DataFrame): dataframe with current and previous periods - curr_q (str): the current period - prev_q (str): the previous period - - Returns: - pd.DataFrame - a dataframe indicating nulls and zeros in target cols. - """ - df = df.copy() - for var in target_variables_list: - cond1 = (df[f"{curr_q}_{var}"].isnull()) | (df[f"{prev_q}_{var}"].isnull()) - cond2 = (df[f"{curr_q}_{var}"] == 0) | (df[f"{prev_q}_{var}"] == 0) - df[f"{var}_valid"] = np.where(cond1 | cond2, False, True) - - return df - - -def calc_growth_ratio( - target_variable: str, - df: pd.DataFrame, - current_period: int, - previous_period: int, -) -> pd.DataFrame: - """Calculate the growth ratio for imputation. - - For the current target_variable, a growth_ratio column is created. - A growth rate is calculated for those rows where the "target_value_valid" - is true, meaning that there are no nulls or zeros in the previous or - current periods, TODO and the status is a 'responder' status. - - If this condition is not met, the row has a null value in this column. - - Args: - target_variable (str): The column name of the target variable. - df (pd.DataFrame): The dataframe containing the target variables. 
- current_period - - Returns: - pd.DataFrame - """ - flagged_df = flag_nulls_and_zeros( - [target_variable], df, current_period, previous_period - ) - - responder_statuses = ["Clear", "Clear - overridden", "Clear - overridden SE"] - - cond1 = flagged_df[f"{target_variable}_valid"] - cond2 = flagged_df["status"].isin(responder_statuses) - - flagged_df[f"{target_variable}_growth_ratio"] = np.where( - cond1 & cond2, - ( - df[f"{current_period}_{target_variable}"] - / df[f"{previous_period}_{target_variable}"] - ), - np.nan, - ) - df = flagged_df.drop(columns=[f"{target_variable}_valid"]) - - return df - - -def sort_df(target_variable: str, df: pd.DataFrame) -> pd.DataFrame: - """_summary_ - - Args: - target_variable (_type_): _description_ - - Returns: - _type_: _description_ - """ - # import ipdb - - # ipdb.set_trace() - # sorted based on hard coded list (in arg by=) - sorted_df = df.sort_values( - by=[ - "200", - "201", - f"{target_variable}_growth_ratio", - "employees", - "reference", - ], - ascending=[True, True, True, False, True], - ) - sorted_df.reset_index(drop=True, inplace=True) - - return sorted_df - - -def trim_check( - df: pd.DataFrame, check_value=10 -) -> pd.DataFrame: # TODO add check_value to a cofig - """_summary_ - - Args: - df (pd.DataFrame, check_value, optional): _description_ - Defaults to 10)->pd.DataFrame(. - - Returns: - _type_: _description_ - """ - # tag for those classes with more than check_value (currently 10) - if len(df) <= check_value: # TODO or is this just < - df["trim_check"] = "below_trim_threshold" - else: - df["trim_check"] = "above_trim_threshold" - - return df - - -def trim_bounds( - df: pd.DataFrame, - lower_perc=15, # TODO add percentages to config - - # check method inBERD_imputation_spec_V3 - upper_perc=15, -) -> pd.DataFrame: - """_summary_ - - Args: - df (pd.DataFrame, lower_perc, optional): _description_. - Defaults to 15, TODO add percentages to config - - Returns: - _type_: _description_ - """ - # trim only if more than 10 - df = filter_by_column_content(df, "trim_check", "above_trim_threshold") - df.reset_index(drop=True, inplace=True) - - # define the bounds for trimming - remove_lower = np.ceil(len(df) * (lower_perc / 100)) - remove_upper = np.ceil(len(df) * (1 - upper_perc / 100)) - - # create trim tag (distinct from trim_check) - # to mark which to trim for mean growth ratio - df["trim"] = "do trim" - df.loc[ - remove_lower : remove_upper - 2, "trim" - ] = "dont trim" # TODO check if needs to be inclusive of exlusive - - return df - - -def get_mean_growth_ratio( - df: pd.DataFrame, - dict_mean_growth_ratio: dict, # TODO maybe rename to more decriptive name - unique_item: str, - target_variable: str, -) -> pd.DataFrame: - """_summary_ - - Args: - dict_mean_growth_ratio (_type_): _description_ - - Returns: - _type_: _description_ - """ - """Including the count of matched pairs -for each imputed variable and imputation -class in the output would be helpful for -the RAP team and MQD to determine the -quality of the imputed value. 
""" - - # remove the "trim" tagged rows - df_trimmed = filter_by_column_content(df, "trim", "dont trim") - - dict_mean_growth_ratio[ - f"{unique_item}_{target_variable}_mean_growth_ratio and count" - ] = [ - df_trimmed[f"{target_variable}_growth_ratio"].mean(), - len(df_trimmed), - ] # TODO check same len(df[f'{target_variable}_growth_ratio'] and len(df) - # Also add to a dataframe: - # df[f'{target_variable}_mean_growth_ratio'] = \ - # df[f'{target_variable}_growth_ratio'].mean() - - return dict_mean_growth_ratio # TODO aka "imputation links" - # what naming is best? - - -def loop_unique( - df: pd.DataFrame, # TODO think of a better name for function - column: str, - target_variables_list: list, - current_period: str, - previous_period: str, - dict_mean_growth_ratio={}, -) -> pd.DataFrame: - """_summary_ - - Args: - df (_type_): _description_ - - Returns: - _type_: _description_ - """ - # will be looping over the class col - # dict_mean_growth_ratio = {} # TODO change to dict at the end - # growth_ratio_dfs_list = [] - # for subsets of class and then on target variable at a time - # growht ratio in calculated, data is sorted, trim check done, - # trim bounds calculated and labelled then mean growth ratio - # calculated and stored in a dictionary - for unique_item in df[column].unique(): - unique_item_df = df[df[column] == unique_item].copy() - for target_variable in target_variables_list: - growth_ratio_df = calc_growth_ratio( - target_variable, unique_item_df, current_period, previous_period - ) - sorted_df = sort_df(target_variable, growth_ratio_df) - trim_check_df = trim_check(sorted_df) - trimmed_df = trim_bounds(trim_check_df) - - dict_mean_growth_ratio = get_mean_growth_ratio( - trimmed_df, dict_mean_growth_ratio, unique_item, target_variable - ) - # growth_ratio_dfs_list.append(growth_ratio_df) - # could also store in a df? - - # growth_ratio_df = pd.concat(growth_ratio_dfs_list) - # could also store ina dataframe - - return dict_mean_growth_ratio # , growth_ratio_df - # aka "imputation links" - what naming is best? 
- - -# TODO break this function into smaller functions -def forward_imputation( - df: pd.DataFrame, - column: str, - target_variables_list: list, - current_period: str, - previous_period: str, -) -> pd.DataFrame: - """_summary_ - - Args: - df (_type_): _description_ - - Returns: - _type_: _description_ - """ - - df_growth_ratio = df[~df.isin([np.nan]).any(axis=1)].copy() - # df_growth_ratio = df[ - # df[f"{current_period}_var1"] != "missing" - # ].copy() # TODO add f string - - dict_mean_growth_ratio = loop_unique( - df_growth_ratio, - column, - target_variables_list, - current_period, - previous_period, - ) - - dfs_list = [] - df_final = df.copy() - for class_name in df_final[f"{current_period}_class"].unique(): - for var in target_variables_list: - df_other = df_final[ - df_final[f"{current_period}_class"] == class_name - ].copy() - df_other = df_other[ - df_other[f"{current_period}_{var}"].isnull() - ].copy() # change the name of df_final and df_other - - df_other[f"{class_name}_{var}_growth_ratio"] = dict_mean_growth_ratio[ - f"{class_name}_{var}_mean_growth_ratio and count" - ][0] - df_other[f"forwards_imputed_{var}"] = round( - df_other[f"{class_name}_{var}_growth_ratio"] - * df_other[f"{previous_period}_{var}"] - ).astype("Int64") - - df_other = df_other.drop(columns=[f"{class_name}_{var}_growth_ratio"]) - dfs_list.append(df_other) - - df_out = pd.concat(dfs_list) - - return df_out - - -# TODO break this function into smaller functions -def backwards_imputation( - df: pd.DataFrame, - column: str, - target_variables_list: list, - current_period: str, - previous_period: str, -) -> pd.DataFrame: - """_summary_ - - Args: - df (_type_): _description_ - - Returns: - _type_: _description_ - """ - - df_growth_ratio = df[~df.isin([np.nan]).any(axis=1)].copy() - # df_growth_ratio = df[ - # df[f"{previous_period}_var1"] != "missing" - # ].copy() # TODO add f string - - dict_mean_growth_ratio = loop_unique( - df_growth_ratio, - column, - target_variables_list, - current_period, - previous_period, - ) - - dfs_list = [] - df_final = df.copy() - for class_name in df_final[f"{current_period}_class"].unique(): - for var in target_variables_list: - df_other = df_final[ - df_final[f"{current_period}_class"] == class_name - ].copy() - df_other = df_other[ - df_other[f"{previous_period}_{var}"].isnull() - ].copy() # TODO change the name of df_final and df_other - # TODO add f string to previous_period_var1 - df_other[f"{class_name}_{var}_growth_ratio"] = dict_mean_growth_ratio[ - f"{class_name}_{var}_mean_growth_ratio and count" - ][0] - df_other[f"backwards_imputed_{var}"] = round( - df_other[f"{current_period}_{var}"] - / df_other[f"{class_name}_{var}_growth_ratio"] - ).astype("Int64") - df_other = df_other.drop(columns=[f"{class_name}_{var}_growth_ratio"]) - dfs_list.append(df_other) - - df_out = pd.concat(dfs_list) - - return df_out - - -def run_imputation( - # full_responses: pd.DataFrame, # df = full_responses.copy() - # column: str, - test_df, - target_variables_list: list, - current_period: str, - previous_period: str, -) -> pd.DataFrame: - """_summary_ - - Args: - df (_type_): _description_ - - Returns: - _type_: _description_ - """ - - # replacing civ_or_def with 200 and Product_group with 201 - test_df = rename_imp_col(test_df) - - # q200 is Business or business R&D type - # q201 is Product Group - clean_df = create_imp_class_col(test_df, "200", "201", f"{current_period}_class") - clean_df.reset_index(drop=True, inplace=True) - - # TODO:flag_nulls_and_zeros() could can optionally be run to 
output a QA csv - # indicating where there are nulls and zeros in the target variables - # flagged_df = flag_nulls_and_zeros( - # target_variables_list, clean_df, current_period, previous_period - # ) - - forward_df = forward_imputation( - clean_df, - f"{current_period}_class", - target_variables_list, - current_period, - previous_period, - ) - - backwards_df = backwards_imputation( - clean_df, - f"{current_period}_class", - target_variables_list, - current_period, - previous_period, - ) - - return forward_df, backwards_df - - -def update_imputed( - full_resp_df, - imputed_vals_df, - target_variables_list, - imputation_direction, - ref_col="reference", -) -> pd.DataFrame: - """Updates missing response data with imputed values for target variables - - Keyword Arguments: - full_resp_df -- DataFrame of the response data - imputed_vals_df -- DataFrame contining imputed values calculated in - imputation module - target_variables_list -- list of variable that need imputed if no - response - imputation_direction -- can be either "forwards" or "backwards" depending on - whether current or previous period has no response - - Returns: - full_resp_df: DataFrame with missing exchanged for imputed values - for target variables - """ - - # Validate the input dataframes checking for columns - if not all( - col in full_resp_df.columns for col in [ref_col] + target_variables_list - ): - ImputationLogger.debug("There are some cols missing in full responses.") - raise ValueError("One or more columns are missing in full_resp_df") - - if not all( - col in imputed_vals_df.columns - for col in [ref_col] - + [f"{imputation_direction}_imputed_{col}" for col in target_variables_list] - ): - ImputationLogger.debug("There are some cols missing in imputed_vals_df.") - raise ValueError("One or more columns are missing in imputed_vals_df") - - # add imputed tag column - full_resp_df["imputation_marker"] = "response" - imputed_vals_df["imputation_marker"] = f"{imputation_direction}_imputed" - - # exchange reference col for index - # in preparation for update function - full_resp_df.index = full_resp_df[ref_col] - imputed_vals_df.index = imputed_vals_df[ref_col] - - # rename cols in preparation for update function - for col in target_variables_list: - imputed_vals_df = imputed_vals_df.rename( - columns={f"{imputation_direction}_imputed_{col}": col} - ) - - # apply update - changes input_full inplace - full_resp_df.update(imputed_vals_df) - - # change index back to normal - full_resp_df = full_resp_df.reset_index(drop=True) - - return full_resp_df diff --git a/src/imputation/imputation_helpers.py b/src/imputation/imputation_helpers.py index f91f646b5..0e6622a41 100644 --- a/src/imputation/imputation_helpers.py +++ b/src/imputation/imputation_helpers.py @@ -245,7 +245,6 @@ def tidy_imputation_dataframe( to_drop += ["200_original", "pg_sic_class", "empty_pgsic_group", "empty_pg_group"] to_drop += ["200_imp_marker", "211_trim", "305_trim", "manual_trim"] - to_drop += ["sf_expansion_grouping"] df = df.drop(columns=to_drop) # Keep only clear and imputed records diff --git a/tests/test_imputation/test_imputation.py b/tests/test_imputation/test_imputation.py deleted file mode 100644 index 024411750..000000000 --- a/tests/test_imputation/test_imputation.py +++ /dev/null @@ -1,1173 +0,0 @@ -import numpy as np -import pandas as pd -from pandas._testing import assert_frame_equal -from pandas import DataFrame as pandasDF - -from src.imputation.imputation import ( - update_imputed, - run_imputation, - backwards_imputation, - 
forward_imputation, - loop_unique, - get_mean_growth_ratio, - trim_bounds, - trim_check, - calc_growth_ratio, - sort_df, - filter_by_column_content, - create_imp_class_col, - filter_same_class, - filter_pairs, - flag_nulls_and_zeros, -) - - -class TestCleanData: # usetag - """Unit test for filter_by_column_content""" - - def input_data_filter_by_column_content(self): - """Create input data for the filter_by_column_content function""" - - # columns for the dataframe - input_cols = ["clean_check"] - - # data in the column order above - input_data = [["clean"], ["not_clean"]] - - # Create a pandas dataframe - input_df = pandasDF(data=input_data, columns=input_cols) - - return input_df - - def output_data_filter_by_column_content(self): - """Create output data for the filter_by_column_content function""" - - # columns for the dataframe - output_cols = ["clean_check"] - - # data in the column order above - output_data = [["clean"]] - - # Create a pandas dataframe - df_expout = pandasDF(data=output_data, columns=output_cols) - - return df_expout - - def test_filter_by_column_content(self): - """Test the expected functionality""" - - df_input = self.input_data_filter_by_column_content() - df_expout = self.output_data_filter_by_column_content() - column = "clean_check" - column_content = "clean" - df_result = filter_by_column_content( - df_input, column, column_content - ) # add period filter functionality - assert_frame_equal(df_result, df_expout) - - -class TestCreateClassCol: - """Unit test for create_imp_class_col""" - - def input_data_create_imp_class_col(self): - """Create input data for the create_imp_class_col function""" - - input_cols = ["200", "201"] - - input_data = [["C", "AG"]] - - input_df = pandasDF(data=input_data, columns=input_cols) - - return input_df - - def output_data_create_imp_class_col(self): - """Create output data for the create_imp_class_col function""" - - output_cols = ["200", "201", "class"] - - output_data = [["C", "AG", "C_AG"]] - - df_expout = pandasDF(data=output_data, columns=output_cols) - - return df_expout - - def test_create_imp_class_col(self): - """Test the expected functionality""" - - df_input = self.input_data_create_imp_class_col() - df_expout = self.output_data_create_imp_class_col() - - col_first_half = "200" - col_second_half = "201" - class_name = "class" - - df_result = create_imp_class_col( - df_input, col_first_half, col_second_half, class_name - ) # add period filter functionality - assert_frame_equal(df_result, df_expout) - - -class TestFilterSameClass: - """Unit test for filter_same_class""" - - def input_data_filter_same_class(self): - """Create input data for the filter_same_class function""" - - # columns for the dataframe - input_cols = ["company_ref", "190012_class", "190009_class"] - - # data in the column order above - input_data = [ - [1, "class1", "class1"], - [10, "class1", "class2"], - [20, "class2", "class1"], - ] - - # Create a pandas dataframe - input_df = pandasDF(data=input_data, columns=input_cols) - - return input_df - - def output_data_filter_same_class(self): - """Create output data for the filter_same_class function""" - - # columns for the dataframe - output_cols = ["company_ref", "190012_class", "190009_class"] - - # data in the column order above - output_data = [[1, "class1", "class1"]] - - # Create a pandas dataframe - df_expout = pandasDF(data=output_data, columns=output_cols) - - return df_expout - - def test_filter_same_class(self): - """Test the expected functionality""" - - df_input = 
self.input_data_filter_same_class() - df_expout = self.output_data_filter_same_class() - - current_period = "190012" - previous_period = "190009" - - df_result = filter_same_class(df_input, current_period, previous_period) - assert_frame_equal(df_result, df_expout) - - -class TestFilterPairs: - """Unit test for filter_pairs""" - - def input_data_filter_pairs(self): - """Create input data for the filter_pairs function""" - - # columns for the dataframe - input_cols = ["company_ref", "190012_target_status", "190009_target_status"] - - # data in the column order above - input_data = [ - [1, "Present", "Present"], - [10, "Missing", "Present"], - [20, "Present", "Missing"], - ] - - # Create a pandas dataframe - input_df = pandasDF(data=input_data, columns=input_cols) - - return input_df - - def output_data_filter_pairs(self): - """Create output data for the filter_pairs function""" - - # columns for the dataframe - output_cols = ["company_ref", "190012_target_status", "190009_target_status"] - - # data in the column order above - output_data = [[1, "Present", "Present"]] - - # Create a pandas dataframe - df_expout = pandasDF(data=output_data, columns=output_cols) - - return df_expout - - def test_filter_pairs(self): - """Test the expected functionality""" - - df_input = self.input_data_filter_pairs() - df_expout = self.output_data_filter_pairs() - - target_variable = "target" - current_period = "190012" - previous_period = "190009" - - df_result = filter_pairs( - df_input, target_variable, current_period, previous_period - ) # add period filter functionality - assert_frame_equal(df_result, df_expout) - - -class TestFlagNullsZeros: - """Unit tests for flag_nulls_zeros.""" - - def input_data(self): - """Create dataframe for input data.""" - input_schema = { - "ref": "Int64", - "curr_var1": "Int64", - "prev_var1": "Int64", - "curr_var2": "Int64", - "prev_var2": "Int64", - } - - input_data = [ - [1, 100, np.nan, 0, 201], - [2, 100, 101, 200, 201], - [3, np.nan, 101, 200, 201], - [4, 100, 101, 200, 201], - [5, 100, np.nan, 200, 201], - [6, 100, 101, 200, 201], - [7, 100, 101, 0, 201], - [8, 100, 101, 200, 0], - ] - - input_df = pandasDF(data=input_data, columns=input_schema.keys()).astype( - input_schema - ) - - return input_df - - def output_data(self): - """Create dataframe for output data.""" - out_schema = { - "ref": "Int64", - "curr_var1": "Int64", - "prev_var1": "Int64", - "curr_var2": "Int64", - "prev_var2": "Int64", - "var1_valid": "Bool", - "var2_valid": "Bool", - } - - output_data = [ - [1, 100, np.nan, 0, 201, False, False], - [2, 100, 101, 200, 201, True, True], - [3, np.nan, 101, 200, 201, False, True], - [4, 100, 101, 200, 201, True, True], - [5, 100, np.nan, 200, 201, False, True], - [6, 100, 101, 200, 201, True, True], - [7, 100, 101, 0, 201, True, False], - [8, 100, 101, 200, 0, True, False], - ] - - output_df = pandasDF(data=output_data, columns=out_schema.keys()).astype( - out_schema - ) - - return output_df - - def test_flag_nulls_and_zeros(self): - """Unit test for flag_nulls_and_zeros.""" - df_expout = self.output_data() - input_df = self.input_data() - df_result = flag_nulls_and_zeros(["var1", "var2"], input_df, "curr", "prev") - assert_frame_equal(df_result, df_expout) - - -class TestCalcGrowthRatio: - """Unit test for calc_growth_ratio""" - - def input_data_calc_growth_ratio(self): - """Create input data for the calc_growth_ratio function""" - - input_cols = { - "status": "str", - "current_var1": "Int64", - "previous_var1": "Int64", - "current_var2": "Int64", - 
"previous_var2": "Int64", - } - - input_data = [ - ["Clear", 2, 8, 2, 4], - ["Clear", 3, 6, 2, np.nan], - ["Clear", np.nan, 8, np.nan, 4], - ["Clear", 2, 1, 2, 4], - ["Form sent out", 5, 3, 2, 4], - ] - - input_df = pandasDF(data=input_data, columns=input_cols.keys()).astype( - input_cols - ) - - return input_df - - def output_data_calc_growth_ratio( - self, - ): # 'Imputed(Fwd)','Imputed(Bwd)', 'ACTUAL', 'Const(Prog)' - """Create output data for the calc_growth_ratio function""" - - output_cols = { - "status": "str", - "current_var1": "Int64", - "previous_var1": "Int64", - "current_var2": "Int64", - "previous_var2": "Int64", - "var1_growth_ratio": "float", - } - - output_data = [ - ["Clear", 2, 8, 2, 4, 0.25], - ["Clear", 3, 6, 2, np.nan, 0.5], - ["Clear", np.nan, 8, np.nan, 4, np.nan], - ["Clear", 2, 1, 2, 4, 2.0], - ["Form sent out", 5, 3, 2, 4, np.nan], - ] - - # Create a pandas dataframe - df_expout = pandasDF(data=output_data, columns=output_cols.keys()).astype( - output_cols - ) - - return df_expout - - def test_calc_growth_ratio(self): - """Test the expected functionality""" - - target_variable = "var1" - input_df = self.input_data_calc_growth_ratio() - df_expout = self.output_data_calc_growth_ratio() - current_period = "current" - previous_period = "previous" - - print(input_df, "\n", df_expout) - df_result = calc_growth_ratio( - target_variable, input_df, current_period, previous_period - ) - assert_frame_equal(df_result, df_expout) - - -class TestSortDf: - """Unit test for sort_df""" - - def input_data_sort_df(self): - """Create input data for the sort_df function""" - - # columns for the dataframe - input_cols = [ - "200", - "201", - "var1_growth_ratio", - "employees", - "reference", - ] - - # data in the column order above - input_data = [ - [3, 1, 1, 1, 1], - [2, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [3, 1, 1, 2, 1], - [2, 1, 1, 2, 1], - [1, 1, 1, 2, 1], - ] - - # Create a pandas dataframe - input_df = pandasDF(data=input_data, columns=input_cols) - - return input_df - - def output_data_sort_df(self): - """Create output data for the sort_df function""" - - # columns for the dataframe - output_cols = [ - "200", - "201", - "var1_growth_ratio", - "employees", - "reference", - ] - - # data in the column order above - output_data = [ - [1, 1, 1, 2, 1], - [1, 1, 1, 1, 1], - [2, 1, 1, 2, 1], - [2, 1, 1, 1, 1], - [3, 1, 1, 2, 1], - [3, 1, 1, 1, 1], - ] - - # Create a pandas dataframe - df_expout = pandasDF(data=output_data, columns=output_cols) - - return df_expout - - def test_sort_df(self): - """Test the expected functionality""" - - df_input = self.input_data_sort_df() - df_expout = self.output_data_sort_df() - target_variable = "var1" - - df_result = sort_df(target_variable, df_input) - assert_frame_equal(df_result, df_expout) - - -class TestTrimCheck: - """Unit test for trim_check""" - - def input_data_trim_check_less_than_10(self): - """Create input data for the trim_check function""" - - # columns for the dataframe - input_cols = ["col1", "col2"] - - # data in the column order above - input_data = [ - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - ] # 9 rows (less than 10) - - # Create a pandas dataframe - input_df = pandasDF(data=input_data, columns=input_cols) - - return input_df - - def input_data_trim_check_equal_10(self): - """Create input data for the trim_check function""" - - # columns for the dataframe - input_cols = ["col1", "col2"] - - # data in the column order above - input_data = [ - [1, 1], - [1, 1], - [1, 1], - [1, 1], - 
[1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - ] # 10 rows (==10) - - # Create a pandas dataframe - input_df = pandasDF(data=input_data, columns=input_cols) - - return input_df - - def input_data_trim_check_more_than_10(self): - """Create input data for the trim_check function""" - - # columns for the dataframe - input_cols = ["col1", "col2"] - - # data in the column order above - input_data = [ - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - [1, 1], - ] # 11 rows (more than 10) - - # Create a pandas dataframe - input_df = pandasDF(data=input_data, columns=input_cols) - - return input_df - - def output_data_trim_check_less_than_10(self): - """Create output data for the trim_check function""" - - # columns for the dataframe - output_cols = ["col1", "col2", "trim_check"] - - # data in the column order above - output_data = [ - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - ] - - # Create a pandas dataframe - df_expout = pandasDF(data=output_data, columns=output_cols) - - return df_expout - - def output_data_trim_check_equal_10(self): - """Create output data for the trim_check function""" - - # columns for the dataframe - output_cols = ["col1", "col2", "trim_check"] - - # data in the column order above - output_data = [ - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - [1, 1, "below_trim_threshold"], - ] - - # Create a pandas dataframe - df_expout = pandasDF(data=output_data, columns=output_cols) - - return df_expout - - def output_data_trim_check_more_than_10(self): - """Create output data for the trim_check function""" - - # columns for the dataframe - output_cols = ["col1", "col2", "trim_check"] - - # data in the column order above - output_data = [ - [1, 1, "above_trim_threshold"], - [1, 1, "above_trim_threshold"], - [1, 1, "above_trim_threshold"], - [1, 1, "above_trim_threshold"], - [1, 1, "above_trim_threshold"], - [1, 1, "above_trim_threshold"], - [1, 1, "above_trim_threshold"], - [1, 1, "above_trim_threshold"], - [1, 1, "above_trim_threshold"], - [1, 1, "above_trim_threshold"], - [1, 1, "above_trim_threshold"], - ] - - # Create a pandas dataframe - df_expout = pandasDF(data=output_data, columns=output_cols) - - return df_expout - - def test_trim_check(self): - """Test the expected functionality""" - - df_input_less_than_10 = self.input_data_trim_check_less_than_10() - df_input_equal_10 = self.input_data_trim_check_equal_10() - df_input_more_than_10 = self.input_data_trim_check_more_than_10() - - df_expout_less_than_10 = self.output_data_trim_check_less_than_10() - df_expout_equal_10 = self.output_data_trim_check_equal_10() - df_expout_more_than_10 = self.output_data_trim_check_more_than_10() - - df_result_less_than_10 = trim_check(df_input_less_than_10) - df_result_equal_10 = trim_check(df_input_equal_10) - df_result_more_than_10 = trim_check(df_input_more_than_10) - - assert_frame_equal(df_expout_less_than_10, df_result_less_than_10) - assert_frame_equal(df_expout_equal_10, df_result_equal_10) - 
assert_frame_equal(df_expout_more_than_10, df_result_more_than_10) - - -class TestTrimBounds: - """Unit test for trim_bounds""" - - def input_data_trim_bounds(self): - """Create input data for the trim_bounds function""" - - # columns for the dataframe - input_cols = ["col1", "col2", "trim_check"] - - # data in the column order above - input_data = [ - [1, 1, "above_trim_threshold"], - [2, 1, "above_trim_threshold"], - [3, 1, "above_trim_threshold"], - [4, 1, "above_trim_threshold"], - [5, 1, "above_trim_threshold"], - ] - - # Create a pandas dataframe - input_df = pandasDF(data=input_data, columns=input_cols) - - return input_df - - def output_data_trim_bounds(self): - """Create output data for the trim_bounds function""" - - # columns for the dataframe - output_cols = ["col1", "col2", "trim_check", "trim"] - - # data in the column order above - output_data = [ - [1, 1, "above_trim_threshold", "do trim"], - [2, 1, "above_trim_threshold", "dont trim"], - [3, 1, "above_trim_threshold", "dont trim"], - [4, 1, "above_trim_threshold", "dont trim"], - [5, 1, "above_trim_threshold", "do trim"], - ] # ! would I want to remove 4th and 5 or just 4th - - # Create a pandas dataframe - output_df = pandasDF(data=output_data, columns=output_cols) - - return output_df - - def test_trim_bounds(self): - """Test the expected functionality""" - - input_df = self.input_data_trim_bounds() - expout_df = self.output_data_trim_bounds() - - df_result = trim_bounds(input_df) # add period filter functionality - assert_frame_equal(df_result, expout_df) - - -class TestGetMeanGrowthRatio: - """Unit test for get_mean_growth_ratio""" - - def input_data_get_mean_growth_ratio(self): - """Create input data for the get_mean_growth_ratio function""" - - # columns for the dataframe - input_cols = ["var1_growth_ratio", "trim"] - - # data in the column order above - input_data = [ - [1, "dont trim"], - [2, "dont trim"], - [3, "dont trim"], - [4, "dont trim"], - [5, "dont trim"], - ] - - # Create a pandas dataframe - input_df = pandasDF(data=input_data, columns=input_cols) - - return input_df - - def output_data_get_mean_growth_ratio(self): - """Create output data for the get_mean_growth_ratio function""" - - # output dict - output_dict = {"class1_var1_mean_growth_ratio and count": [3.0, 5]} - - return output_dict - - def test_get_mean_growth_ratio(self): - """Test the expected functionality""" - - input_df = self.input_data_get_mean_growth_ratio() - expout_dict = self.output_data_get_mean_growth_ratio() - # expout_df = self.output_data_get_mean_growth_ratio_df() - - result_dict = get_mean_growth_ratio( - input_df, {}, "class1", "var1" - ) # add period filter functionality - assert result_dict == expout_dict - # assert_frame_equal(results_df, expout_df) - - -class TestLoopUnique: # testing for loops run as expected - """Unit test for loop_unique""" - - def input_data_loop_unique(self): - """Create input data for the loop_unique function""" - - # columns for the dataframe - input_cols = [ - "status", - "current_period_class", - "200", - "201", - "current_period_var1", - "current_period_var2", - "previous_period_var1", - "previous_period_var2", - "employees", - "reference", - "trim", - ] - - # data in the column order above - input_data = [ - ["Clear", "class1", "C", "G", 2, 4, 2, 2, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 2, 4, 2, 2, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 2, 4, 2, 2, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 2, 4, 2, 2, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 2, 4, 2, 
2, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 2, 4, 2, 2, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 2, 4, 2, 2, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 2, 4, 2, 2, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 2, 4, 2, 2, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 2, 4, 2, 2, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 2, 4, 2, 2, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 8, 2, 2, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 8, 2, 2, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 8, 2, 2, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 8, 2, 2, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 8, 2, 2, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 8, 2, 2, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 8, 2, 2, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 8, 2, 2, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 8, 2, 2, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 8, 2, 2, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 8, 2, 2, 1, 1, "dont trim"], - ] # (more than 10 rows per class) - - # Create a pandas dataframe - input_df = pandasDF(data=input_data, columns=input_cols) - - return input_df - - def output_data_loop_unique(self): - """Create output data for the loop_unique function""" - - # output dict - output_dict = { - "class1_var1_mean_growth_ratio and count": [1.0, 7], - "class1_var2_mean_growth_ratio and count": [2.0, 7], - "class2_var1_mean_growth_ratio and count": [3.0, 7], - "class2_var2_mean_growth_ratio and count": [4.0, 7], - } - - return output_dict - - def test_loop_unique(self): - """Test the expected functionality""" - - input_df = self.input_data_loop_unique() - expout_dict = self.output_data_loop_unique() - # expout_df = self.output_data_loop_unique_df() - - column = "current_period_class" - target_variables_list = ["var1", "var2"] - current_period = "current_period" - previous_period = "previous_period" - - result_dict = loop_unique( - input_df, # removed , result_df - column, - target_variables_list, - current_period, - previous_period, - ) - assert result_dict == expout_dict - # assert_frame_equal(result_df, expout_df) - - -class TestForwardImputation: - """Unit test for forward_imputation""" - - def input_data_forward_imputation(self): - """Create input data for the forward_imputation function""" - - input_cols = { - "status": "str", - "current_period_class": "str", - "200": "str", - "201": "str", - "current_period_var1": "Int64", - "previous_period_var1": "Int64", - "employees": "Int64", - "reference": "Int64", - "trim": "str", - } - - input_data = [ - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", np.nan, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", 
"class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", np.nan, 1, 1, 1, "dont trim"], - ] # (more than 10 rows per class) - - input_df = pandasDF(data=input_data, columns=input_cols.keys()) - input_df = input_df.astype(input_cols) - - return input_df - - def output_data_forward_imputation(self): - """Create output data for the forward_imputation function""" - - output_cols = { - "status": "str", - "current_period_class": "str", - "200": "str", - "201": "str", - "current_period_var1": "Int64", - "previous_period_var1": "Int64", - "employees": "Int64", - "reference": "Int64", - "trim": "str", - "forwards_imputed_var1": "Int64", - } - - output_data = [ - ["Clear", "class1", "C", "G", np.nan, 1, 1, 1, "dont trim", 4], - ["Clear", "class2", "D", "G", np.nan, 1, 1, 1, "dont trim", 6], - ] # (more than 10 rows per class) - - output_df = pandasDF( - data=output_data, columns=output_cols.keys(), index=[11, 23] - ) - output_df = output_df.astype(output_cols) - - return output_df - - def test_forward_imputation(self): - """Test the expected functionality""" - - input_df = self.input_data_forward_imputation() - expout_dict = self.output_data_forward_imputation() - - column = "current_period_class" - target_variables_list = ["var1"] - current_period = "current_period" - previous_period = "previous_period" - - df_result = forward_imputation( - input_df, column, target_variables_list, current_period, previous_period - ) - - assert_frame_equal(df_result, expout_dict) - - -class TestBackwardsImputation: - """Unit test for backwards_imputation""" - - def input_data_backwards_imputation(self): - """Create input data for the backwards_imputation function""" - - # columns for the dataframe - input_cols = { - "status": "str", - "current_period_class": "str", - "200": "str", - "201": "str", - "current_period_var1": "Int64", - "previous_period_var1": "Int64", - "employees": "Int64", - "reference": "Int64", - "trim": "str", - } - - # data in the column order above - input_data = [ - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, 1, 1, 1, "dont trim"], - ["Clear", "class1", "C", "G", 4, np.nan, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", 
"G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, 1, 1, 1, "dont trim"], - ["Clear", "class2", "D", "G", 6, np.nan, 1, 1, "dont trim"], - ] # (more than 10 rows per class) - - # Create a pandas dataframe - input_df = pandasDF(data=input_data, columns=input_cols.keys()) - input_df = input_df.astype(input_cols) - - return input_df - - def output_data_backwards_imputation(self): - """Create output data for the backwards_imputation function""" - - # columns for the dataframe - output_cols = { - "status": "str", - "current_period_class": "str", - "200": "str", - "201": "str", - "current_period_var1": "Int64", - "previous_period_var1": "Int64", - "employees": "Int64", - "reference": "Int64", - "trim": "str", - "backwards_imputed_var1": "Int64", - } - - # data in the column order above - output_data = [ - ["Clear", "class1", "C", "G", 4, np.nan, 1, 1, "dont trim", 1], - ["Clear", "class2", "D", "G", 6, np.nan, 1, 1, "dont trim", 1], - ] # (more than 10 rows per class) - - # Create a pandas dataframe - output_df = pandasDF( - data=output_data, columns=output_cols.keys(), index=[11, 23] - ) - output_df = output_df.astype(output_cols) - - return output_df - - def test_backwards_imputation(self): - """Test the expected functionality""" - - input_df = self.input_data_backwards_imputation() - expout_df = self.output_data_backwards_imputation() - - column = "current_period_class" - target_variables_list = ["var1"] - current_period = "current_period" - previous_period = "previous_period" - - df_result = backwards_imputation( - input_df, column, target_variables_list, current_period, previous_period - ) - - assert_frame_equal(df_result, expout_df) - - -class TestRunImputation: - """Unit test for run_imputation""" - - def input_data_run_imputation(self): - """Create input data for the run_imputation function""" - # Currently input_df isn't being used as - # fake data is hard coded into - # function until ingest is firmed down - - # columns for the dataframe - input_cols = { - "status": "str", - "reference": "Int64", - "civ_or_def": "str", - "Product_group": "str", - "employees": "Int64", - "202012_var1": "Int64", - "202012_var2": "Int64", - "202009_var1": "Int64", - "202009_var2": "Int64", - } - - # data in the column order above - input_data = [ - ["Clear", 1, "2", "A", 100, 1, 1, 1, 3], - ["Clear", 2, "2", "A", 100, 11, 1, 10, 3], - ["Clear", 3, "2", "A", 100, 11, 1, 10, 3], - ["Clear", 4, "2", "A", 100, 11, 1, 10, 3], - ["Clear", 5, "2", "A", 100, 11, 1, 10, 3], - ["Clear", 6, "2", "A", 100, 11, 1, 10, 3], - ["Clear", 7, "2", "A", 100, 11, 1, 10, 3], - ["Clear", 8, "2", "A", 100, 11, 1, 10, 3], - ["Clear", 9, "2", "A", 100, 11, 1, 10, 3], - ["Clear", 10, "2", "A", 100, 11, 1, 10, 3], - ["Clear", 11, "2", "A", 100, 110, 1, 100, 3], - ["Clear", 12, "2", "A", 100, np.nan, 1, 10, 3], - ["Clear", 13, "2", "B", 100, 1, 1, 1, 3], - ["Clear", 14, "2", "B", 100, 11, 1, 10, 3], - ["Clear", 15, "2", "B", 100, 11, 1, 10, 3], - ["Clear", 16, "2", "B", 100, 11, 1, 10, 3], - ["Clear", 17, "2", "B", 100, 11, 1, 10, 3], - ["Clear", 18, "2", "B", 100, 11, 1, 10, 3], - ["Clear", 19, "2", "B", 100, 11, 1, 10, 3], - ["Clear", 20, "2", "B", 100, 11, 1, 10, 3], - ["Clear", 21, "2", "B", 100, 11, 1, 10, 3], - ["Clear", 22, "2", "B", 100, 11, 1, 10, 3], - ["Clear", 23, "2", "B", 100, 110, 1, 100, 3], - ["Clear", 24, "2", "B", 100, 11, 1, 10, np.nan], - ] # (more than 10 rows per class) - - # Create 
a pandas dataframe - input_df = pandasDF(data=input_data, columns=input_cols.keys()) - input_df = input_df.astype(input_cols) - - return input_df - - def output_data_run_imputation(self): - """Create output data for the run_imputation function""" - output_cols_f = { - "status": "str", - "reference": "Int64", - "200": "str", - "201": "str", - "employees": "Int64", - "202012_var1": "Int64", - "202012_var2": "Int64", - "202009_var1": "Int64", - "202009_var2": "Int64", - "202012_class": "str", - "forwards_imputed_var1": "Int64", - "forwards_imputed_var2": "Int64", - } - - output_data_for = [ - ["Clear", 12, "2", "A", 100, np.nan, 1, 10, 3, "2_A", 11, np.nan], - ] # (more than 10 rows per class) - - output_df_for = pandasDF( - data=output_data_for, columns=output_cols_f.keys(), index=[11] - ).astype(output_cols_f) - - # TODO check data types and update headers - # when using real data - # columns for the dataframe - output_cols_b = { - "status": "str", - "reference": "Int64", - "200": "str", - "201": "str", - "employees": "Int64", - "202012_var1": "Int64", - "202012_var2": "Int64", - "202009_var1": "Int64", - "202009_var2": "Int64", - "202012_class": "str", - "backwards_imputed_var1": "Int64", - "backwards_imputed_var2": "Int64", - } - - # TODO check data types and update headers - # when using real data - # data in the column order above - output_data_back = [ - ["Clear", 24, "2", "B", 100, 11, 1, 10, np.nan, "2_B", np.nan, 3], - ] # (more than 10 rows per class) - - output_df_back = pandasDF( - data=output_data_back, columns=output_cols_b.keys(), index=[23] - ).astype(output_cols_b) - - return output_df_for, output_df_back - - def test_run_imputation(self): - """Test the expected functionality""" - - input_df = self.input_data_run_imputation() - expout_df_for, expout_df_back = self.output_data_run_imputation() - - target_variables_list = ["var1", "var2"] - current_period = "202012" - previous_period = "202009" - result_for, result_back = run_imputation( - input_df, target_variables_list, current_period, previous_period - ) - pd.set_option("display.max_rows", None) - pd.set_option("display.max_columns", None) - pd.set_option("display.width", 2000) - print(result_for) - assert_frame_equal(result_for, expout_df_for) - assert_frame_equal(result_back, expout_df_back) - - -class TestUpdateImputed: - """Unit test for update_imputed""" - - def input_data_update_imputed(self): - """Create input data for the update_imputed function""" - - # columns for the dataframe - input_cols_full = [ - "reference", - "col2", - ] - - # data in the column order above - input_data_full = [ - [1.0, 1.0], - [2.0, np.nan], - ] - - # Create a pandas dataframe - input_full = pandasDF(data=input_data_full, columns=input_cols_full) - - # columns for the dataframe - input_cols_imputed = [ - "reference", - "forwards_imputed_col2", - ] - - # data in the column order above - input_data_imputed = [ - [2.0, 1.0], - ] - - # Create a pandas dataframe - input_imputed = pandasDF(data=input_data_imputed, columns=input_cols_imputed) - - return input_full, input_imputed - - def output_data_update_imputed(self): - """Create output data for the update_imputed function""" - - # columns for the dataframe - output_cols = ["reference", "col2", "imputation_marker"] - - # data in the column order above - output_data = [ - [1.0, 1.0, "response"], - [2.0, 1.0, "forwards_imputed"], - ] # (more than 10 rows per class) - - # Create a pandas dataframe - output_df = pandasDF(data=output_data, columns=output_cols) - - return output_df - - def 
test_update_imputed(self): - """Test the expected functionality""" - - input_full, input_imputed = self.input_data_update_imputed() - output_df = self.output_data_update_imputed() - - target_variables_list = ["col2"] - direction = "forwards" - - df_result = update_imputed( - input_full, input_imputed, target_variables_list, direction - ) - - assert_frame_equal(df_result, output_df) From 799802f936990a9113521e21cc679e7a9cbabe05 Mon Sep 17 00:00:00 2001 From: jwestw Date: Thu, 11 Jan 2024 15:40:01 +0000 Subject: [PATCH 10/26] Changes requested during joint review --- src/imputation/imputation_helpers.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/imputation/imputation_helpers.py b/src/imputation/imputation_helpers.py index 0e6622a41..75c1142d6 100644 --- a/src/imputation/imputation_helpers.py +++ b/src/imputation/imputation_helpers.py @@ -222,13 +222,13 @@ def fill_sf_zeros(df: pd.DataFrame) -> pd.DataFrame: def tidy_imputation_dataframe( - df: pd.DataFrame, - config: Dict, - logger, - to_impute_cols: List, - write_csv: Callable, - run_id: int, - ) -> pd.DataFrame: + df: pd.DataFrame, + config: Dict, + logger: logging.Logger, + to_impute_cols: List, + write_csv: Callable, + run_id: int, +) -> pd.DataFrame: """Remove rows and columns not needed after imputation.""" # Create lists for the qa cols imp_cols = [f"{col}_imputed" for col in to_impute_cols] @@ -247,15 +247,15 @@ def tidy_imputation_dataframe( to_drop += ["200_imp_marker", "211_trim", "305_trim", "manual_trim"] df = df.drop(columns=to_drop) - # Keep only clear and imputed records - imputed_statuses = ["TMI", "CF", "MoR", "constructed"] - to_keep = df["imp_marker"].isin(imputed_statuses) | (df["imp_marker"] == "R") + # Keep only imputed records and clear ("R") + imp_markers_to_keep = ["TMI", "CF", "MoR", "constructed"] + to_keep = df["imp_marker"].isin(imp_markers_to_keep) | (df["imp_marker"] == "R") to_keep_df = df.copy().loc[to_keep] filtered_output_df = df.copy().loc[~to_keep] # change the value of the status column to 'imputed' for imputed statuses - condition = to_keep_df["status"].isin(imputed_statuses) + condition = to_keep_df["imp_marker"].isin(imp_markers_to_keep) to_keep_df.loc[condition, "status"] = "imputed" # Running status filtered full dataframe output for QA From a01f5f201ee5661edcce873279808b1cb931f5a0 Mon Sep 17 00:00:00 2001 From: Tom Coates Date: Thu, 11 Jan 2024 15:56:56 +0000 Subject: [PATCH 11/26] fix shortform and postcode bugs in construction --- src/construction/construction.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/construction/construction.py b/src/construction/construction.py index 484ee4e51..b5f043cde 100644 --- a/src/construction/construction.py +++ b/src/construction/construction.py @@ -4,6 +4,7 @@ from typing import Callable from src.staging.validation import validate_data_with_schema +from src.staging.staging_helpers import postcode_topup from src.outputs.outputs_helpers import create_period_year construction_logger = logging.getLogger(__name__) @@ -87,6 +88,10 @@ def run_construction( updated_snapshot_df = create_period_year(updated_snapshot_df) construction_df = create_period_year(construction_df) + # Set instance=1 so longforms with status 'Form sent out' match correctly + form_sent_condition = (updated_snapshot_df.formtype == "0001") & (updated_snapshot_df.status == "Form sent out") + updated_snapshot_df.loc[form_sent_condition, "instance"] = 1 + # NI data has no instance but needs an instance of 1 if 
is_northern_ireland: construction_df["instance"] = 1 @@ -115,6 +120,19 @@ def run_construction( {"reference": "Int64", "instance": "Int64", "period_year": "Int64"} ) + # Long form records with a postcode in 601 use this as the postcode + long_form_cond = (~updated_snapshot_df["601"].isnull()) + updated_snapshot_df.loc[long_form_cond, "postcodes_harmonised"] = updated_snapshot_df["601"] + + # Short form records with nothing in 601 use referencepostcode instead + short_form_cond = (updated_snapshot_df["601"].isnull()) & (~updated_snapshot_df["referencepostcode"].isnull()) + updated_snapshot_df.loc[short_form_cond, "postcodes_harmonised"] = updated_snapshot_df["referencepostcode"] + + # Top up all new postcodes so they're all eight characters exactly + postcode_cols = ["601", "referencepostcode", "postcodes_harmonised"] + for col in postcode_cols: + updated_snapshot_df[col] = updated_snapshot_df[col].apply(postcode_topup) + construction_logger.info(f"Construction edited {construction_df.shape[0]} rows.") return updated_snapshot_df From 79fe040faf3af483a665cc35cffbf8e5a5214228 Mon Sep 17 00:00:00 2001 From: Cheshire Date: Thu, 11 Jan 2024 16:21:41 +0000 Subject: [PATCH 12/26] RDRP-646: basic functionality added --- .pre-commit-config.yaml | 22 +++++++++++----------- src/construction/construction.py | 25 +++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e7f65f642..fc5ea89b9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,17 +9,17 @@ repos: args: - --extra-keys - "metadata.colab metadata.kernelspec cell.metadata.colab cell.metadata.executionInfo cell.metadata.id cell.metadata.outputId" - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.0.1 - hooks: - # - id: check-added-large-files - # name: Check for files larger than 5 MB - # args: ["--maxkb=5120"] - - id: end-of-file-fixer - name: Check for a blank line at the end of scripts (auto-fixes) - exclude: '\.Rd' - - id: trailing-whitespace - name: Check for trailing whitespaces (auto-fixes) + # - repo: https://github.com/pre-commit/pre-commit-hooks + # rev: v4.0.1 + # hooks: + # # - id: check-added-large-files + # # name: Check for files larger than 5 MB + # # args: ["--maxkb=5120"] + # - id: end-of-file-fixer + # name: Check for a blank line at the end of scripts (auto-fixes) + # exclude: '\.Rd' + # - id: trailing-whitespace + # name: Check for trailing whitespaces (auto-fixes) - repo: https://github.com/pycqa/isort rev: 5.8.0 hooks: diff --git a/src/construction/construction.py b/src/construction/construction.py index 484ee4e51..dc1751eed 100644 --- a/src/construction/construction.py +++ b/src/construction/construction.py @@ -77,6 +77,9 @@ def run_construction( validate_data_with_schema(construction_df, schema_path) construction_df = construction_df.dropna(axis="columns", how="all") + # Prepare the short to long form constructions + updated_snapshot_df = prepare_short_to_long(updated_snapshot_df, construction_df) + # Add flags to indicate whether a row was constructed or should be imputed updated_snapshot_df["is_constructed"] = False updated_snapshot_df["force_imputation"] = False @@ -115,6 +118,28 @@ def run_construction( {"reference": "Int64", "instance": "Int64", "period_year": "Int64"} ) + updated_snapshot_df = updated_snapshot_df.sort_values( + ["reference", "instance"], ascending=[True, True] + ).reset_index(drop=True) + construction_logger.info(f"Construction edited {construction_df.shape[0]} rows.") 
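    # Editor's aside: a hedged sketch, not the actual helper. The intent of the
    # postcode_topup calls above (per the comment in this patch) is to make
    # every postcode exactly eight characters. Assuming pandas is imported as
    # pd, a minimal stand-in consistent with that description could look like
    # this; the real postcode_topup in src/staging/staging_helpers.py may
    # differ (for example in how it pads or truncates).
    def _postcode_topup_sketch(postcode, width: int = 8):
        # leave missing postcodes untouched, pad/trim everything else to width
        if pd.isnull(postcode):
            return postcode
        return str(postcode).strip().ljust(width)[:width]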
return updated_snapshot_df + + +def prepare_short_to_long(updated_snapshot_df, construction_df): + """Create addional instances for short to long construction""" + # Check which references are going to converted to long forms + short_to_long_refs = construction_df.loc[construction_df["short_to_long"] == True, "reference"].unique() + # Create conversion df + short_to_long_df = updated_snapshot_df[updated_snapshot_df["reference"].isin(short_to_long_refs)] + + # Copy instance 0 record to create instance 1 and instance 2 + short_to_long_df1 = short_to_long_df.copy() + short_to_long_df1["instance"] = 1 + short_to_long_df2 = short_to_long_df.copy() + short_to_long_df2["instance"] = 2 + + # Add new instances to the updated snapshot df + updated_snapshot_df = pd.concat([updated_snapshot_df, short_to_long_df1, short_to_long_df2]) + return updated_snapshot_df From 306405999d6f39492a4e41204c19537e046ccc14 Mon Sep 17 00:00:00 2001 From: Cheshire Date: Thu, 11 Jan 2024 17:21:59 +0000 Subject: [PATCH 13/26] RDRP-646: pre-commits and change to flag order --- .pre-commit-config.yaml | 22 +++++++++++----------- src/construction/construction.py | 18 ++++++++++++------ src/developer_config.yaml | 9 ++++----- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fc5ea89b9..e7f65f642 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,17 +9,17 @@ repos: args: - --extra-keys - "metadata.colab metadata.kernelspec cell.metadata.colab cell.metadata.executionInfo cell.metadata.id cell.metadata.outputId" - # - repo: https://github.com/pre-commit/pre-commit-hooks - # rev: v4.0.1 - # hooks: - # # - id: check-added-large-files - # # name: Check for files larger than 5 MB - # # args: ["--maxkb=5120"] - # - id: end-of-file-fixer - # name: Check for a blank line at the end of scripts (auto-fixes) - # exclude: '\.Rd' - # - id: trailing-whitespace - # name: Check for trailing whitespaces (auto-fixes) + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.0.1 + hooks: + # - id: check-added-large-files + # name: Check for files larger than 5 MB + # args: ["--maxkb=5120"] + - id: end-of-file-fixer + name: Check for a blank line at the end of scripts (auto-fixes) + exclude: '\.Rd' + - id: trailing-whitespace + name: Check for trailing whitespaces (auto-fixes) - repo: https://github.com/pycqa/isort rev: 5.8.0 hooks: diff --git a/src/construction/construction.py b/src/construction/construction.py index dc1751eed..7daf927ac 100644 --- a/src/construction/construction.py +++ b/src/construction/construction.py @@ -77,12 +77,12 @@ def run_construction( validate_data_with_schema(construction_df, schema_path) construction_df = construction_df.dropna(axis="columns", how="all") - # Prepare the short to long form constructions - updated_snapshot_df = prepare_short_to_long(updated_snapshot_df, construction_df) - # Add flags to indicate whether a row was constructed or should be imputed updated_snapshot_df["is_constructed"] = False updated_snapshot_df["force_imputation"] = False + + # Prepare the short to long form constructions + updated_snapshot_df = prepare_short_to_long(updated_snapshot_df, construction_df) construction_df["is_constructed"] = True # Create period_year column, except for NI which already has it @@ -130,9 +130,13 @@ def run_construction( def prepare_short_to_long(updated_snapshot_df, construction_df): """Create addional instances for short to long construction""" # Check which references are going to converted to long forms - 
short_to_long_refs = construction_df.loc[construction_df["short_to_long"] == True, "reference"].unique() + short_to_long_refs = construction_df.loc[ + construction_df["short_to_long"] == True,"reference" + ].unique() # Create conversion df - short_to_long_df = updated_snapshot_df[updated_snapshot_df["reference"].isin(short_to_long_refs)] + short_to_long_df = updated_snapshot_df[ + updated_snapshot_df["reference"].isin(short_to_long_refs) + ] # Copy instance 0 record to create instance 1 and instance 2 short_to_long_df1 = short_to_long_df.copy() @@ -141,5 +145,7 @@ def prepare_short_to_long(updated_snapshot_df, construction_df): short_to_long_df2["instance"] = 2 # Add new instances to the updated snapshot df - updated_snapshot_df = pd.concat([updated_snapshot_df, short_to_long_df1, short_to_long_df2]) + updated_snapshot_df = pd.concat( + [updated_snapshot_df, short_to_long_df1, short_to_long_df2] + ) return updated_snapshot_df diff --git a/src/developer_config.yaml b/src/developer_config.yaml index e07316c20..d34950dbd 100644 --- a/src/developer_config.yaml +++ b/src/developer_config.yaml @@ -9,10 +9,10 @@ global: # Staging and validation settings postcode_csv_check: False load_updated_snapshot: False # Whether to load the updated snapshots for amendments and additions - load_ni_data: True + load_ni_data: False load_historic_data: False - run_construction: False - run_ni_construction: True + run_construction: True + run_ni_construction: False load_manual_outliers: False load_manual_imputation: False load_backdata: True # whether to load previous year data for MoR @@ -23,7 +23,6 @@ global: # Output settings output_full_responses: False output_ni_full_responses: False - output_imputation_qa: False output_auto_outliers: False output_outlier_qa : False output_estimation_qa: False @@ -88,7 +87,7 @@ network_paths: backdata_path: "R:/BERD Results System Development 2023/2021_data/validation-extract-responses-202112.csv" outliers_path: "R:/BERD Results System Development 2023/DAP_emulation/outliers" manual_outliers_path: "R:/BERD Results System Development 2023/DAP_emulation/outliers/manual_outliers/manual_outlier_2023-08-29_v67.csv" - construction_file_path: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/test_construction_file.csv" + construction_file_path: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/JC_test_construction_file.csv" construction_file_path_ni: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/test_construction_ni_file.csv" # construction_add_path: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/construction_additions_2023-11-06_v5.csv" # TODO Need to test # construction_amend_path: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/construction_amendments_2023-10-31_v2.csv" # TODO Need to test From 7fdcfa08dd87844b3235e75bbe211724b1a6c53e Mon Sep 17 00:00:00 2001 From: Griffith Date: Thu, 11 Jan 2024 17:47:45 +0000 Subject: [PATCH 14/26] update tests --- src/outputs/form_output_prep.py | 8 +- src/outputs/gb_sas.py | 15 --- src/outputs/outputs_main.py | 2 - src/outputs/tau.py | 13 -- src/staging/pg_conversion.py | 152 ++++++++++----------- src/staging/staging_main.py | 1 - tests/test_staging/test_pg_conversion.py | 160 ++++++++++++++++------- 7 files changed, 187 insertions(+), 164 deletions(-) diff --git a/src/outputs/form_output_prep.py b/src/outputs/form_output_prep.py index 
0e3898484..7b2401562 100644 --- a/src/outputs/form_output_prep.py +++ b/src/outputs/form_output_prep.py @@ -1,5 +1,5 @@ import pandas as pd -from src.staging.pg_conversion import run_pg_conversion +from src.staging.pg_conversion import sic_to_pg_mapper from src.staging.validation import flag_no_rand_spenders @@ -59,9 +59,9 @@ def form_output_prep( ni_full_responses["form_status"] = 600 ni_full_responses["602"] = 100 ni_full_responses["formtype"] = "0003" - ni_full_responses = run_pg_conversion( - ni_full_responses, pg_num_alpha, sic_pg_alpha, target_col="201" - ) + + # Update column 201 (currently PG numeric) to alpha-numeric, mapping from SIC. + ni_full_responses = sic_to_pg_mapper(ni_full_responses, sic_pg_alpha) # outputs_df = pd.concat([outputs_df, ni_full_responses]) tau_outputs_df = pd.concat([tau_outputs_df, ni_full_responses]) diff --git a/src/outputs/gb_sas.py b/src/outputs/gb_sas.py index 4435a465f..73ffaad8c 100644 --- a/src/outputs/gb_sas.py +++ b/src/outputs/gb_sas.py @@ -7,7 +7,6 @@ import src.outputs.map_output_cols as map_o from src.staging.validation import load_schema from src.outputs.outputs_helpers import create_output_df, regions -from src.staging.pg_conversion import sic_to_pg_mapper GbSasLogger = logging.getLogger(__name__) @@ -19,7 +18,6 @@ def output_gb_sas( run_id: int, ultfoc_mapper: pd.DataFrame, postcode_mapper: pd.DataFrame, - sic_pg_num: pd.DataFrame, ): """Run the outputs module. @@ -31,8 +29,6 @@ def output_gb_sas( run_id (int): The current run id ultfoc_mapper (pd.DataFrame): The ULTFOC mapper DataFrame. postcode_mapper (pd.DataFrame): maps the postcode to region code - pg_alpha_num (pd.DataFrame): mapper of numeric PG to alpha PG - """ NETWORK_OR_HDFS = config["global"]["network_or_hdfs"] @@ -47,20 +43,9 @@ def output_gb_sas( # Join foriegn ownership column using ultfoc mapper df1 = map_o.join_fgn_ownership(df1, ultfoc_mapper) - # Fill in numeric PG for short forms and imputed long forms - df1 = sic_to_pg_mapper( - df1, - sic_pg_num, - target_col="pg_numeric", - from_col="SIC 2007_CODE", - to_col="2016 > Form PG", - formtype=["0006", "0001"], - ) - # Map to the CORA statuses from the statusencoded column df1 = map_o.create_cora_status_col(df1) - # Map the sizebands based on frozen employment df1 = map_o.map_sizebands(df1) diff --git a/src/outputs/outputs_main.py b/src/outputs/outputs_main.py index c61280772..161c0be35 100644 --- a/src/outputs/outputs_main.py +++ b/src/outputs/outputs_main.py @@ -123,7 +123,6 @@ def run_outputs( run_id, ultfoc_mapper, postcode_mapper, - sic_pg_num, ) OutputMainLogger.info("Finished TAU output.") @@ -137,7 +136,6 @@ def run_outputs( run_id, ultfoc_mapper, postcode_mapper, - sic_pg_num, ) OutputMainLogger.info("Finished GB SAS output.") diff --git a/src/outputs/tau.py b/src/outputs/tau.py index e06c70a83..329ab32d3 100644 --- a/src/outputs/tau.py +++ b/src/outputs/tau.py @@ -18,7 +18,6 @@ def output_tau( run_id: int, ultfoc_mapper: pd.DataFrame, postcode_itl_mapper: pd.DataFrame, - sic_pg_num: pd.DataFrame, ): """Run the outputs module. @@ -30,8 +29,6 @@ def output_tau( run_id (int): The current run id ultfoc_mapper (pd.DataFrame): The ULTFOC mapper DataFrame. 
postcode_itl_mapper (pd.DataFrame): maps the postcode to region code - pg_alpha_num (pd.DataFrame): mapper of alpha PG to numeric PG - """ NETWORK_OR_HDFS = config["global"]["network_or_hdfs"] @@ -49,16 +46,6 @@ def output_tau( # Join foriegn ownership column using ultfoc mapper df = map_o.join_fgn_ownership(df, ultfoc_mapper, formtype=["0001", "0006"]) - # Fill in numeric PG for short forms and imputed long forms - df = sic_to_pg_mapper( - df, - sic_pg_num, - target_col="pg_numeric", - from_col="SIC 2007_CODE", - to_col="2016 > Form PG", - formtype=["0006", "0001", "0003"], - ) - # Map to the CORA statuses from the statusencoded column df = map_o.create_cora_status_col(df) diff --git a/src/staging/pg_conversion.py b/src/staging/pg_conversion.py index c6fc43aee..5fbca575c 100644 --- a/src/staging/pg_conversion.py +++ b/src/staging/pg_conversion.py @@ -5,24 +5,34 @@ PgLogger = logging.getLogger(__name__) -def pg_to_pg_mapper( +def sic_to_pg_mapper( df: pd.DataFrame, - mapper: pd.DataFrame, - target_col: str = "product_group", + sicmapper: pd.DataFrame, pg_column: str = "201", - from_col: str = "pg_numeric", - to_col: str = "pg_alpha", + sic_column: str = "rusic", + from_col: str = "SIC 2007_CODE", + to_col: str = "2016 > Form PG", ): - """This function maps all values in one column to another column - using a mapper file. This is applied to long forms only. - The default this is used for is PG numeric to letter conversion. + """Map from SIC code to PG numeric code where PG numeric is null. + + Example initial dataframe: + reference | 201 | rusic + -------------------------------- + 1 | 53 | 2500 + 2 | NaN | 1600 + 3 | NaN | 4300 + + returned dataframe: + reference | 201 | rusic + -------------------------------- + 1 | 53 | 2500 + 2 | 45 | 1600 + 3 | 38 | 4300 Args: - df (pd.DataFrame): The dataset containing all the PG numbers - mapper (pd.DataFrame): The mapper dataframe loaded using custom function - target_col (str, optional): The column we output the - mapped values to (product_group). - pg_column (str, optional): The column we want to convert (201). + df (pd.DataFrame): The dataset containing all the PG numbers. + sicmapper (pd.DataFrame): The SIC to pg numeric mapper. + sic_column (str, optional): The column containing the SIC numbers. from_col (str, optional): The column in the mapper that is used to map from. to_col (str, optional): The column in the mapper that is used to map to. @@ -30,15 +40,11 @@ def pg_to_pg_mapper( pd.DataFrame: A dataframe with all target column values mapped """ - filtered_df = df.copy() - - if "formtype" in filtered_df.columns: - formtype_cond = filtered_df["formtype"] == "0001" - filtered_df = filtered_df[formtype_cond] + df = df.copy() # Create a mapping dictionary from the 2 columns - map_dict = dict(zip(mapper[from_col], mapper[to_col])) - # Flag all PGs that don't have a corresponding map value + map_dict = dict(zip(sicmapper[from_col], sicmapper[to_col])) + # Flag all SIC numbers that don't have a corresponding map value mapless_errors = [] for key, value in map_dict.items(): if str(value) == "nan": @@ -46,45 +52,49 @@ def pg_to_pg_mapper( if mapless_errors: PgLogger.error( - f"Mapping doesnt exist for the following product groups: {mapless_errors}" + f"Mapping doesnt exist for the following SIC numbers: {mapless_errors}" ) - # Map using the dictionary taking into account the null values. 
- # Then convert to categorigal datatype - filtered_df[pg_column] = pd.to_numeric(filtered_df[pg_column], errors="coerce") - filtered_df[target_col] = filtered_df[pg_column].map(map_dict) - filtered_df[target_col] = filtered_df[target_col].astype("category") + # Map to the target column using the dictionary, null values only + df.loc[df[pg_column].isnull(), pg_column] = ( + df.loc[df[pg_column].isnull(), sic_column].map(map_dict) + ) - df.loc[ - filtered_df.index, - f"{target_col}", - ] = filtered_df[target_col] - - PgLogger.info("Product groups successfully mapped to letters") + PgLogger.info("Product group nulls successfully mapped from SIC.") return df -def sic_to_pg_mapper( +def pg_to_pg_mapper( df: pd.DataFrame, - sicmapper: pd.DataFrame, - target_col: str = "product_group", - sic_column: str = "rusic", - from_col: str = "sic", + mapper: pd.DataFrame, + pg_column: str = "201", + from_col: str = "pg_numeric", to_col: str = "pg_alpha", - formtype: str = ["0006"], ): - """This function maps all values in one column to another column - using a mapper file. This is only applied for short forms and unsampled - refs. + """Map from PG numeric to PG alpha-numeric and create a new column. + + The product group column (default: column 201) coped to a new column, "pg_numeric", + and then is updated from numeric to alpha-numeric using a mapping. + + Example initial dataframe: + reference | 201 + ---------------------- + 1 | 53 + 2 | 43 + 3 | 33 + + returned dataframe: + reference | 201 | pg_numeric + ------------------------------------ + 1 | AA | 33 + 2 | B | 43 + 3 | E | 53 - The default this is used for is PG numeric to letter conversion. Args: - df (pd.DataFrame): The dataset containing all the PG numbers. - sicmapper (pd.DataFrame): The mapper dataframe loaded using custom function. - target_col (str, optional): The column we output the - mapped values to (product_group). - sic_column (str, optional): The column containing the SIC numbers. + df (pd.DataFrame): The dataframe requiring mapping + mapper (pd.DataFrame): the PG numeric to alpha-numeric mapper + pg_column (str, optional): The column we want to convert (default 201). from_col (str, optional): The column in the mapper that is used to map from. to_col (str, optional): The column in the mapper that is used to map to. @@ -92,16 +102,15 @@ def sic_to_pg_mapper( pd.DataFrame: A dataframe with all target column values mapped """ - filtered_df = df.copy() - - filtered_df = filtered_df[filtered_df["formtype"].isin(formtype)] + df = df.copy() - if "pg_numeric" in filtered_df.columns: - filtered_df = filtered_df[filtered_df["pg_numeric"].isnull()] + # Copy the numeric PG column to a new column + df["pg_numeric"] = df[pg_column].copy() # Create a mapping dictionary from the 2 columns - map_dict = dict(zip(sicmapper[from_col], sicmapper[to_col])) - # Flag all SIC numbers that don't have a corresponding map value + map_dict = dict(zip(mapper[from_col], mapper[to_col])) + + # Flag all PGs that don't have a corresponding map value mapless_errors = [] for key, value in map_dict.items(): if str(value) == "nan": @@ -109,22 +118,15 @@ def sic_to_pg_mapper( if mapless_errors: PgLogger.error( - f"Mapping doesnt exist for the following SIC numbers: {mapless_errors}" + f"Mapping doesnt exist for the following product groups: {mapless_errors}" ) - # Map to the target column using the dictionary taking into account the null values. 
- # Then convert to categorigal datatype - filtered_df[sic_column] = pd.to_numeric(filtered_df[sic_column], errors="coerce") - filtered_df[target_col] = filtered_df[sic_column].map(map_dict) - filtered_df[target_col] = filtered_df[target_col].astype("category") - df = df.copy() + df[pg_column] = df[pg_column].map(map_dict) - df.loc[ - filtered_df.index, - f"{target_col}", - ] = filtered_df[target_col] + # Then convert the pg column and the new column to categorigal datatypes + df = df.astype({pg_column: "category", "pg_numeric": "category"}) - PgLogger.info("SIC numbers successfully mapped to PG letters") + PgLogger.info("Numeric product groups successfully mapped to letters.") return df @@ -147,22 +149,10 @@ def run_pg_conversion( Returns: (pd.DataFrame): Dataframe with mapped values """ + # Where the + df = sic_to_pg_mapper(df, sic_pg_alpha, ) - df["pg_numeric"] = df["201"].copy() - - if target_col == "201": - target_col = "201_mapping" - else: - # Create a new column to store PGs - df[target_col] = np.nan - - # SIC mapping for short forms - df = sic_to_pg_mapper(df, sic_pg_alpha, target_col=target_col) - - # SIC mapping for NI - df = sic_to_pg_mapper(df, sic_pg_alpha, target_col=target_col, formtype=["0003"]) - - # PG mapping for long forms + # PG numeric to alpha_numeric mapping for long forms df = pg_to_pg_mapper(df, pg_num_alpha, target_col=target_col) # Overwrite the 201 column if target_col = 201 diff --git a/src/staging/staging_main.py b/src/staging/staging_main.py index 8da3cbffd..6d072a475 100644 --- a/src/staging/staging_main.py +++ b/src/staging/staging_main.py @@ -217,7 +217,6 @@ def run_staging( backdata = pg.pg_to_pg_mapper( backdata, pg_num_alpha, - target_col="q201", pg_column="q201", ) StagingMainLogger.info("Backdata File Loaded Successfully...") diff --git a/tests/test_staging/test_pg_conversion.py b/tests/test_staging/test_pg_conversion.py index a77c2b9f5..0fa74af0c 100644 --- a/tests/test_staging/test_pg_conversion.py +++ b/tests/test_staging/test_pg_conversion.py @@ -8,72 +8,136 @@ @pytest.fixture -def dummy_data() -> pd.DataFrame: +def sic_dummy_data() -> pd.DataFrame: # Set up the dummyinput data - data = pd.DataFrame( - {"201": [0, 1, 2, 3, 4], "formtype": ["0001", "0001", "0001", "0001", "0001"]} - ) - return data + columns = ["201", "rusic"] + data = [ + [53, 2500], + [np.nan, 1600], + [np.nan, 4300], + ] + + return pd.DataFrame(data, columns=columns) @pytest.fixture -def mapper() -> pd.DataFrame: - # Set up the dummy mapper data - mapper = { - "pg_numeric": [0, 1, 2, 3, 4], - "pg_alpha": [np.nan, "A", "B", "C", "C"], - } - return pd.DataFrame(mapper) +def sic_mapper(): + columns = ["sic", "pg"] + mapper_rows = [ + [1600, 36], + [2500, 95], + [7300, 45], + [2500, 53], + ] + + # Create the DataFrame + return pd.DataFrame(mapper_rows, columns=columns) @pytest.fixture -def expected_output() -> pd.DataFrame: +def sic_expected_output() -> pd.DataFrame: # Set up the dummy output data - expected_output = pd.DataFrame( - { - "201": [np.nan, "A", "B", "C", "C"], - "formtype": ["0001", "0001", "0001", "0001", "0001"], - } - ) + columns = ["201", "rusic"] + data = [ + [53, 2500], + [36, 1600], + [np.nan, 4300], + ] - expected_output["201"] = expected_output["201"].astype("category") - return expected_output + return pd.DataFrame(data, columns=columns) -@pytest.fixture -def sic_dummy_data() -> pd.DataFrame: - # Set up the dummyinput data - data = pd.DataFrame( - {"rusic": [1110, 10101], "201": [np.nan, np.nan], "formtype": ["0006", "0006"]} - ) - return data +def 
test_sic_mapper(sic_dummy_data, sic_expected_output, sic_mapper): + """Tests for pg mapper function.""" + expected_output_data = sic_expected_output -@pytest.fixture -def sic_mapper() -> pd.DataFrame: - # Set up the dummy mapper data - mapper = { - "sic": [1110, 10101], - "pg_alpha": ["A", "B"], - } - return pd.DataFrame(mapper) + df_result = sic_to_pg_mapper( + sic_dummy_data, + sic_mapper, + pg_column="201", + from_col="sic", + to_col="pg", + ) + + pd.testing.assert_frame_equal(df_result, expected_output_data) @pytest.fixture -def sic_expected_output() -> pd.DataFrame: - # Set up the dummy output data - expected_output = pd.DataFrame( - {"rusic": [1110, 10101], "201": ["A", "B"], "formtype": ["0006", "0006"]} - ) - expected_output["201"] = expected_output["201"].astype("category") - return expected_output +def mapper(): + mapper_rows = [ + [36, "N"], + [37, "Y"], + [45, "AC"], + [47, "AD"], + [49, "AD"], + [50, "AD"], + [58, "AH"], + ] + columns = ["pg_numeric", "pg_alpha"] + # Create the DataFrame + mapper_df = pd.DataFrame(mapper_rows, columns=columns) -def test_sic_mapper(sic_dummy_data, sic_expected_output, sic_mapper): - """Tests for pg mapper function.""" + # Return the DataFrame + return mapper_df - expected_output_data = sic_expected_output - df_result = sic_to_pg_mapper(sic_dummy_data, sic_mapper, target_col="201") +def test_pg_to_pg_mapper_with_many_to_one(mapper): - pd.testing.assert_frame_equal(df_result, expected_output_data) + columns = ["formtype", "201", "other_col"] + row_data = [ + ["0001", 45, "2020"], + ["0001", 49, "2020"], + ["0002", 50, "2020"] + ] + + test_df = pd.DataFrame(row_data, columns=columns) + + expected_columns = ["formtype", "201", "other_col", "pg_numeric"] + + expected_data = [ + ["0001", "AC", "2020", 45], + ["0001", "AD", "2020", 49], + ["0002", "AD", "2020", 50] + ] + + type_dict = {"201": "category", "pg_numeric": "category"} + + # Build the expected result dataframe. 
Set the dtype of prod group to cat, like the result_df + expected_result_df = pd.DataFrame(expected_data, columns=expected_columns) + expected_result_df = expected_result_df.astype(type_dict) + + result_df = pg_to_pg_mapper(test_df.copy(), mapper.copy()) + + pd.testing.assert_frame_equal(result_df, expected_result_df, check_dtype=False) + + +def test_pg_to_pg_mapper_success(mapper): + columns = ["formtype", "201", "other_col"] + row_data = [ + ["0001", 36, "2020"], + ["0001", 45, "2020"], + ["0002", 58, "2020"], + ["0001", 49, "2020"], + ] + + test_df = pd.DataFrame(row_data, columns=columns) + + expected_columns = ["formtype", "201", "other_col", "pg_numeric"] + expected_data = [ + ["0001", "N", "2020", 36], + ["0001", "AC", "2020", 45], + ["0002", "AH", "2020", 58], + ["0001", "AD", "2020", 49], + ] + + expected_result_df = pd.DataFrame( + expected_data, columns=expected_columns) + + type_dict = {"201": "category", "pg_numeric": "category"} + expected_result_df = expected_result_df.astype(type_dict) + + result_df = pg_to_pg_mapper(test_df.copy(), mapper.copy()) + + pd.testing.assert_frame_equal(result_df, expected_result_df) From 53bb0944102de8a4fff388a83d5ec4bf6606b92b Mon Sep 17 00:00:00 2001 From: Griffith Date: Thu, 11 Jan 2024 18:38:00 +0000 Subject: [PATCH 15/26] move pg_conversion to imputation --- src/imputation/imputation_main.py | 24 ++++++++++++++++++-- src/{staging => imputation}/pg_conversion.py | 16 ++++--------- src/imputation/tmi_imputation.py | 13 +++-------- src/outputs/form_output_prep.py | 10 +++++--- src/outputs/ni_sas.py | 2 +- src/outputs/outputs_main.py | 4 ++-- src/outputs/tau.py | 1 - src/pipeline.py | 5 ++-- tests/test_staging/test_pg_conversion.py | 2 +- 9 files changed, 44 insertions(+), 33 deletions(-) rename src/{staging => imputation}/pg_conversion.py (91%) diff --git a/src/imputation/imputation_main.py b/src/imputation/imputation_main.py index a023f982c..34a7172d7 100644 --- a/src/imputation/imputation_main.py +++ b/src/imputation/imputation_main.py @@ -7,6 +7,7 @@ from src.imputation import imputation_helpers as hlp from src.imputation import tmi_imputation as tmi from src.staging.validation import load_schema +from src.imputation.pg_conversion import run_pg_conversion, pg_to_pg_mapper from src.imputation.apportionment import run_apportionment from src.imputation.short_to_long import run_short_to_long from src.imputation.MoR import run_mor @@ -21,7 +22,8 @@ def run_imputation( df: pd.DataFrame, manual_trimming_df: pd.DataFrame, - mapper: pd.DataFrame, + pg_num_alpha: pd.DataFrame, + sic_pg_num: pd.DataFrame, backdata: pd.DataFrame, config: Dict[str, Any], write_csv: Callable, @@ -48,6 +50,11 @@ def run_imputation( Returns: pd.DataFrame: dataframe with the imputed columns updated """ + # Carry out product group conversion + df = run_pg_conversion( + df, pg_num_alpha, sic_pg_num, pg_column="201" + ) + # Apportion cols 4xx and 5xx to create FTE and headcount values df = run_apportionment(df) @@ -92,11 +99,24 @@ def run_imputation( # Run MoR if backdata is not None: + # Fix for different column names on network vs hdfs + if NETWORK_OR_HDFS == "network": + # Map PG numeric to alpha in column q201 + # This isn't done on HDFS as the column is already mapped + backdata = pg_to_pg_mapper( + backdata, + pg_num_alpha, + pg_column="q201", + from_col= "pg_numeric", + to_col="pg_alpha", + ) + backdata = backdata.drop("pg_numeric", axis=1) + lf_target_vars = config["imputation"]["lf_target_vars"] df, links_df = run_mor(df, backdata, to_impute_cols, lf_target_vars, 
config) # Run TMI for long forms and short forms - imputed_df, qa_df = tmi.run_tmi(df, mapper, config) + imputed_df, qa_df = tmi.run_tmi(df, config) # After imputation, correction to ignore the "604" == "No" in any records with # Status "check needed" diff --git a/src/staging/pg_conversion.py b/src/imputation/pg_conversion.py similarity index 91% rename from src/staging/pg_conversion.py rename to src/imputation/pg_conversion.py index 5fbca575c..4649096a9 100644 --- a/src/staging/pg_conversion.py +++ b/src/imputation/pg_conversion.py @@ -134,8 +134,8 @@ def pg_to_pg_mapper( def run_pg_conversion( df: pd.DataFrame, pg_num_alpha: pd.DataFrame, - sic_pg_alpha: pd.DataFrame, - target_col: str = "201", + sic_pg_num: pd.DataFrame, + pg_column: str = "201", ): """Run the product group mapping functions and return a dataframe with the correct mapping for each formtype. @@ -143,21 +143,15 @@ def run_pg_conversion( Args: df (pd.DataFrame): Dataframe of full responses data mapper (pd.DataFrame): The mapper file used for PG conversion - target_col (str, optional): The column to be created - which stores mapped values. + pg_column: The original product group column Returns: (pd.DataFrame): Dataframe with mapped values """ # Where the - df = sic_to_pg_mapper(df, sic_pg_alpha, ) + df = sic_to_pg_mapper(df, sic_pg_num, pg_column) # PG numeric to alpha_numeric mapping for long forms - df = pg_to_pg_mapper(df, pg_num_alpha, target_col=target_col) - - # Overwrite the 201 column if target_col = 201 - if target_col == "201_mapping": - df["201"] = df[target_col] - df = df.drop(columns=[target_col]) + df = pg_to_pg_mapper(df, pg_num_alpha, pg_column) return df diff --git a/src/imputation/tmi_imputation.py b/src/imputation/tmi_imputation.py index ecd170875..c3ea7eaff 100644 --- a/src/imputation/tmi_imputation.py +++ b/src/imputation/tmi_imputation.py @@ -3,7 +3,7 @@ import numpy as np from typing import Dict, List, Tuple, Any -from src.staging.pg_conversion import sic_to_pg_mapper +from src.imputation.pg_conversion import sic_to_pg_mapper from src.imputation.impute_civ_def import impute_civil_defence from src.imputation import expansion_imputation as ximp @@ -425,7 +425,6 @@ def calculate_totals(df): def run_longform_tmi( longform_df: pd.DataFrame, - sic_mapper: pd.DataFrame, config: Dict[str, Any], ) -> Tuple[pd.DataFrame, pd.DataFrame]: """Function to run imputation end to end and returns the final @@ -434,7 +433,6 @@ def run_longform_tmi( Args: longform_df (pd.DataFrame): the dataset filtered for long form entries target_variables (list): key variables - sic_mapper (pd.DataFrame): dataframe with sic mapper info config (Dict): the configuration settings Returns: final_df: dataframe with the imputed valued added @@ -442,10 +440,7 @@ def run_longform_tmi( qa_df: qa dataframe """ TMILogger.info("Starting TMI long form imputation.") - - # TMI Step 1: impute the Product Group - df = impute_pg_by_sic(longform_df, sic_mapper) - + df = longform_df.copy() # TMI Step 2: impute for R&D type (civil or defence) df = impute_civil_defence(df) @@ -520,7 +515,6 @@ def run_shortform_tmi( def run_tmi( full_df: pd.DataFrame, - sic_mapper: pd.DataFrame, config: Dict[str, Any], ) -> Tuple[pd.DataFrame, pd.DataFrame]: """Function to run imputation end to end and returns the final @@ -528,7 +522,6 @@ def run_tmi( dataframe back to the pipeline Args: full_df (pd.DataFrame): the full responses spp dataframe - sic_mapper (pd.DataFrame): dataframe with sic to product group mapper info config (Dict): the configuration settings Returns: 
final_df(pd.DataFrame): dataframe with the imputed valued added and counts columns @@ -553,7 +546,7 @@ def run_tmi( excluded_df = full_df.copy().loc[mor_mask] # apply TMI imputation to long forms and then short forms - longform_tmi_df, qa_df_long = run_longform_tmi(longform_df, sic_mapper, config) + longform_tmi_df, qa_df_long = run_longform_tmi(longform_df, config) shortform_tmi_df, qa_df_short = run_shortform_tmi(shortform_df, config) diff --git a/src/outputs/form_output_prep.py b/src/outputs/form_output_prep.py index 7b2401562..4ac885b41 100644 --- a/src/outputs/form_output_prep.py +++ b/src/outputs/form_output_prep.py @@ -1,5 +1,5 @@ import pandas as pd -from src.staging.pg_conversion import sic_to_pg_mapper +from src.imputation.pg_conversion import run_pg_conversion from src.staging.validation import flag_no_rand_spenders @@ -8,7 +8,7 @@ def form_output_prep( weighted_df: pd.DataFrame, ni_full_responses: pd.DataFrame, pg_num_alpha: pd.DataFrame, - sic_pg_alpha: pd.DataFrame, + sic_pg_num: pd.DataFrame, ): """Prepares the data for the outputs. @@ -61,7 +61,11 @@ def form_output_prep( ni_full_responses["formtype"] = "0003" # Update column 201 (currently PG numeric) to alpha-numeric, mapping from SIC. - ni_full_responses = sic_to_pg_mapper(ni_full_responses, sic_pg_alpha) + ni_full_responses = run_pg_conversion( + ni_full_responses, + pg_num_alpha, + sic_pg_num + ) # outputs_df = pd.concat([outputs_df, ni_full_responses]) tau_outputs_df = pd.concat([tau_outputs_df, ni_full_responses]) diff --git a/src/outputs/ni_sas.py b/src/outputs/ni_sas.py index b9ea85285..538dcf9f7 100644 --- a/src/outputs/ni_sas.py +++ b/src/outputs/ni_sas.py @@ -6,7 +6,7 @@ import src.outputs.map_output_cols as map_o from src.staging.validation import load_schema from src.outputs.outputs_helpers import create_output_df -from src.staging.pg_conversion import sic_to_pg_mapper +from src.imputation.pg_conversion import sic_to_pg_mapper OutputMainLogger = logging.getLogger(__name__) diff --git a/src/outputs/outputs_main.py b/src/outputs/outputs_main.py index 161c0be35..1de77450b 100644 --- a/src/outputs/outputs_main.py +++ b/src/outputs/outputs_main.py @@ -58,7 +58,7 @@ def run_outputs( civil_defence_detailed (pd.DataFrame): Detailed descriptons of civil/defence sic_division_detailed (pd.DataFrame): Detailed descriptons of SIC divisions pg_num_alpha (pd.DataFrame): Mapper for product group conversions (num to alpha) - sic_pg_alpha (pd.DataFrame): Mapper for product group conversions (SIC to alpha) + sic_pg_num (pd.DataFrame): Mapper for product group conversions """ ( @@ -71,7 +71,7 @@ def run_outputs( weighted_df, ni_full_responses, pg_num_alpha, - sic_pg_alpha, + sic_pg_num, ) # Running status filtered full dataframe output for QA diff --git a/src/outputs/tau.py b/src/outputs/tau.py index 329ab32d3..02e7ed11b 100644 --- a/src/outputs/tau.py +++ b/src/outputs/tau.py @@ -6,7 +6,6 @@ import src.outputs.map_output_cols as map_o from src.staging.validation import load_schema from src.outputs.outputs_helpers import create_output_df -from src.staging.pg_conversion import sic_to_pg_mapper OutputMainLogger = logging.getLogger(__name__) diff --git a/src/pipeline.py b/src/pipeline.py index 81ded7174..83f9cccad 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -137,7 +137,8 @@ def run_pipeline(start, config_path): imputed_df = run_imputation( full_responses, manual_trimming_df, - sic_pg_alpha, + pg_num_alpha, + sic_pg_num, backdata, config, write_csv, @@ -196,7 +197,7 @@ def run_pipeline(start, config_path): 
civil_defence_detailed, sic_division_detailed, pg_num_alpha, - sic_pg_alpha, + sic_pg_num, ) MainLogger.info("Finished All Output modules.") diff --git a/tests/test_staging/test_pg_conversion.py b/tests/test_staging/test_pg_conversion.py index 0fa74af0c..d39418fd7 100644 --- a/tests/test_staging/test_pg_conversion.py +++ b/tests/test_staging/test_pg_conversion.py @@ -4,7 +4,7 @@ import pytest import numpy as np -from src.staging.pg_conversion import pg_to_pg_mapper, sic_to_pg_mapper +from src.imputation.pg_conversion import pg_to_pg_mapper, sic_to_pg_mapper @pytest.fixture From 8b0176accc500d0174923aa2face4ffb90eb350e Mon Sep 17 00:00:00 2001 From: Griffith Date: Mon, 15 Jan 2024 10:09:19 +0000 Subject: [PATCH 16/26] 648 minor changes --- src/imputation/pg_conversion.py | 15 ++++++++------- src/staging/staging_main.py | 15 --------------- 2 files changed, 8 insertions(+), 22 deletions(-) diff --git a/src/imputation/pg_conversion.py b/src/imputation/pg_conversion.py index 4649096a9..76bcf3dd1 100644 --- a/src/imputation/pg_conversion.py +++ b/src/imputation/pg_conversion.py @@ -73,8 +73,10 @@ def pg_to_pg_mapper( ): """Map from PG numeric to PG alpha-numeric and create a new column. - The product group column (default: column 201) coped to a new column, "pg_numeric", - and then is updated from numeric to alpha-numeric using a mapping. + The mapper used is from a file named pg_num_alpha.csv + + The product group column (default: column 201) is copied to a new column, + "pg_numeric", and then the original column is mapped from numeric to alpha-numeric. Example initial dataframe: reference | 201 @@ -137,18 +139,17 @@ def run_pg_conversion( sic_pg_num: pd.DataFrame, pg_column: str = "201", ): - """Run the product group mapping functions and return a - dataframe with the correct mapping for each formtype. + """Run the product group (PG) mapping functions. Args: df (pd.DataFrame): Dataframe of full responses data - mapper (pd.DataFrame): The mapper file used for PG conversion - pg_column: The original product group column + pg_num_alpha (pd.DataFrame): Mapper from numeric to alpha-numeric PG. + pg_column: The original product group column, default 201 Returns: (pd.DataFrame): Dataframe with mapped values """ - # Where the + # Where product group is null, map it from SIC. df = sic_to_pg_mapper(df, sic_pg_num, pg_column) # PG numeric to alpha_numeric mapping for long forms diff --git a/src/staging/staging_main.py b/src/staging/staging_main.py index 6d072a475..383c18d14 100644 --- a/src/staging/staging_main.py +++ b/src/staging/staging_main.py @@ -8,7 +8,6 @@ # Our own modules from src.staging import validation as val -from src.staging import pg_conversion as pg import src.staging.staging_helpers as helpers @@ -210,15 +209,6 @@ def run_staging( # backdata_path, "./config/backdata_schema.toml" # ) - # Fix for different column names on network vs hdfs - if network_or_hdfs == "network": - # Map PG numeric to alpha in column q201 - # This isn't done on HDFS as the column is already mapped - backdata = pg.pg_to_pg_mapper( - backdata, - pg_num_alpha, - pg_column="q201", - ) StagingMainLogger.info("Backdata File Loaded Successfully...") else: backdata = None @@ -286,11 +276,6 @@ def run_staging( mapper_path = paths["mapper_path"] write_csv(f"{mapper_path}/sic_pg_num.csv", sic_pg_utf_mapper) - # Map PG from SIC/PG numbers to column '201'. 
- full_responses = pg.run_pg_conversion( - full_responses, pg_num_alpha, sic_pg_alpha_mapper, target_col="201" - ) - pg_detailed_mapper = helpers.load_valdiate_mapper( "pg_detailed_mapper_path", paths, From eb637e2ad47cd176db2341e2de405e0fcb94a2c2 Mon Sep 17 00:00:00 2001 From: Griffith Date: Mon, 15 Jan 2024 10:42:33 +0000 Subject: [PATCH 17/26] add exception if mapper not working --- src/imputation/pg_conversion.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/imputation/pg_conversion.py b/src/imputation/pg_conversion.py index 76bcf3dd1..fa6d0556b 100644 --- a/src/imputation/pg_conversion.py +++ b/src/imputation/pg_conversion.py @@ -54,6 +54,8 @@ def sic_to_pg_mapper( PgLogger.error( f"Mapping doesnt exist for the following SIC numbers: {mapless_errors}" ) + raise Exception("Errors in the SIC to PG numeric mapper.") + # Map to the target column using the dictionary, null values only df.loc[df[pg_column].isnull(), pg_column] = ( df.loc[df[pg_column].isnull(), sic_column].map(map_dict) @@ -122,6 +124,7 @@ def pg_to_pg_mapper( PgLogger.error( f"Mapping doesnt exist for the following product groups: {mapless_errors}" ) + raise Exception("Errors in the PG numeric to alpha-numeric mapper.") df[pg_column] = df[pg_column].map(map_dict) From 58e7e578e8f9c51c78c7637c5dbe14a17c6609cf Mon Sep 17 00:00:00 2001 From: Griffith Date: Mon, 15 Jan 2024 10:46:39 +0000 Subject: [PATCH 18/26] remove duplicate line from config --- src/developer_config.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/src/developer_config.yaml b/src/developer_config.yaml index 7408666a9..a771aade6 100644 --- a/src/developer_config.yaml +++ b/src/developer_config.yaml @@ -27,7 +27,6 @@ global: output_auto_outliers: False output_outlier_qa : False output_estimation_qa: False - output_imputation_qa: False output_apportionment_qa: False output_long_form: False output_short_form: False From 86a91e3fceed8aa9bb6ad00fa46dd97ccb8acc1a Mon Sep 17 00:00:00 2001 From: Griffith Date: Mon, 15 Jan 2024 11:39:45 +0000 Subject: [PATCH 19/26] remove unnecessary pg conversion from NI sas --- src/outputs/ni_sas.py | 16 +--------------- src/outputs/outputs_main.py | 2 -- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/src/outputs/ni_sas.py b/src/outputs/ni_sas.py index 538dcf9f7..717f76854 100644 --- a/src/outputs/ni_sas.py +++ b/src/outputs/ni_sas.py @@ -6,7 +6,7 @@ import src.outputs.map_output_cols as map_o from src.staging.validation import load_schema from src.outputs.outputs_helpers import create_output_df -from src.imputation.pg_conversion import sic_to_pg_mapper +from src.imputation.pg_conversion import run_pg_conversion OutputMainLogger = logging.getLogger(__name__) @@ -16,8 +16,6 @@ def output_ni_sas( config: Dict[str, Any], write_csv: Callable, run_id: int, - sic_pg_num: pd.DataFrame, - postcode_itl_mapper: pd.DataFrame, ): """Run the outputs module. 
@@ -39,18 +37,6 @@ def output_ni_sas( paths = config[f"{NETWORK_OR_HDFS}_paths"] output_path = paths["output_path"] - # Prepare the columns needed for outputs: - - # Fill in numeric PG where missing - df = sic_to_pg_mapper( - df, - sic_pg_num, - target_col="pg_numeric", - from_col="SIC 2007_CODE", - to_col="2016 > Form PG", - formtype=["0003"], - ) - # Map the sizebands based on frozen employment df = map_o.map_sizebands(df) diff --git a/src/outputs/outputs_main.py b/src/outputs/outputs_main.py index 1de77450b..5bc3556fe 100644 --- a/src/outputs/outputs_main.py +++ b/src/outputs/outputs_main.py @@ -147,8 +147,6 @@ def run_outputs( config, write_csv, run_id, - sic_pg_num, - postcode_mapper, ) OutputMainLogger.info("Finished NI SAS output.") From 2309d54ed28057887b97f237eb1e283ad957f07c Mon Sep 17 00:00:00 2001 From: George Zorinyants Date: Mon, 15 Jan 2024 11:58:40 +0000 Subject: [PATCH 20/26] Postcode top up returns an empty string when the postcode is empty --- src/developer_config.yaml | 4 ++-- src/staging/staging_helpers.py | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/developer_config.yaml b/src/developer_config.yaml index e07316c20..90ed0eb16 100644 --- a/src/developer_config.yaml +++ b/src/developer_config.yaml @@ -9,7 +9,7 @@ global: # Staging and validation settings postcode_csv_check: False load_updated_snapshot: False # Whether to load the updated snapshots for amendments and additions - load_ni_data: True + load_ni_data: False load_historic_data: False run_construction: False run_ni_construction: True @@ -38,7 +38,7 @@ global: output_intram_by_itl1: False output_intram_by_civil_defence: False output_intram_by_sic: False - output_status_filtered: False + output_status_filtered: True output_fte_total_qa: False years: current_year: 2022 # TODO: put this in the userconfig diff --git a/src/staging/staging_helpers.py b/src/staging/staging_helpers.py index 0cb87b58e..4d53efa52 100644 --- a/src/staging/staging_helpers.py +++ b/src/staging/staging_helpers.py @@ -33,8 +33,7 @@ def postcode_topup(mystr: str, target_len: int = 8) -> str: spaces and cuts the tail on the right. If there is only one part, keeps the first 8 characters and tops it up with spaces on the right if needed. - Empty input string would have zero parts and will return a string of - eight spaces. + Empty input string would have zero parts and will return an empty string. Args: mystr (str): Input postcode. 
@@ -69,7 +68,7 @@ def postcode_topup(mystr: str, target_len: int = 8) -> str: return (part1 + part2)[:target_len] else: - return mystr[:target_len].ljust(target_len, " ") + return "" def fix_anon_data(responses_df, config): From f89e860395c443cee517fdb8bd6eb2faf8e43cf2 Mon Sep 17 00:00:00 2001 From: Cheshire Date: Mon, 15 Jan 2024 13:53:54 +0000 Subject: [PATCH 21/26] RDRP-646: move short to long to only run on GB --- src/construction/construction.py | 8 ++++---- src/developer_config.yaml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/construction/construction.py b/src/construction/construction.py index 7daf927ac..939899bd4 100644 --- a/src/construction/construction.py +++ b/src/construction/construction.py @@ -80,13 +80,13 @@ def run_construction( # Add flags to indicate whether a row was constructed or should be imputed updated_snapshot_df["is_constructed"] = False updated_snapshot_df["force_imputation"] = False - - # Prepare the short to long form constructions - updated_snapshot_df = prepare_short_to_long(updated_snapshot_df, construction_df) construction_df["is_constructed"] = True - # Create period_year column, except for NI which already has it + # Run GB specific actions if not is_northern_ireland: + # Prepare the short to long form constructions (N/A to NI) + updated_snapshot_df = prepare_short_to_long(updated_snapshot_df, construction_df) + # Create period_year column (NI already has it) updated_snapshot_df = create_period_year(updated_snapshot_df) construction_df = create_period_year(construction_df) diff --git a/src/developer_config.yaml b/src/developer_config.yaml index d34950dbd..066c5f58f 100644 --- a/src/developer_config.yaml +++ b/src/developer_config.yaml @@ -11,7 +11,7 @@ global: load_updated_snapshot: False # Whether to load the updated snapshots for amendments and additions load_ni_data: False load_historic_data: False - run_construction: True + run_construction: False run_ni_construction: False load_manual_outliers: False load_manual_imputation: False From 079b8192553efb7b00acd1adbef00475c6624155 Mon Sep 17 00:00:00 2001 From: Cheshire Date: Mon, 15 Jan 2024 14:34:44 +0000 Subject: [PATCH 22/26] RDRP-646: moved postcode function to GB only --- src/construction/construction.py | 33 ++++++++++++++++---------------- src/developer_config.yaml | 3 ++- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/src/construction/construction.py b/src/construction/construction.py index c8aae69da..126e04a34 100644 --- a/src/construction/construction.py +++ b/src/construction/construction.py @@ -90,10 +90,9 @@ def run_construction( # Create period_year column (NI already has it) updated_snapshot_df = create_period_year(updated_snapshot_df) construction_df = create_period_year(construction_df) - - # Set instance=1 so longforms with status 'Form sent out' match correctly - form_sent_condition = (updated_snapshot_df.formtype == "0001") & (updated_snapshot_df.status == "Form sent out") - updated_snapshot_df.loc[form_sent_condition, "instance"] = 1 + # Set instance=1 so longforms with status 'Form sent out' match correctly + form_sent_condition = (updated_snapshot_df.formtype == "0001") & (updated_snapshot_df.status == "Form sent out") + updated_snapshot_df.loc[form_sent_condition, "instance"] = 1 # NI data has no instance but needs an instance of 1 if is_northern_ireland: @@ -123,18 +122,20 @@ def run_construction( {"reference": "Int64", "instance": "Int64", "period_year": "Int64"} ) - # Long form records with a postcode in 601 use this as the postcode 
- long_form_cond = (~updated_snapshot_df["601"].isnull()) - updated_snapshot_df.loc[long_form_cond, "postcodes_harmonised"] = updated_snapshot_df["601"] - - # Short form records with nothing in 601 use referencepostcode instead - short_form_cond = (updated_snapshot_df["601"].isnull()) & (~updated_snapshot_df["referencepostcode"].isnull()) - updated_snapshot_df.loc[short_form_cond, "postcodes_harmonised"] = updated_snapshot_df["referencepostcode"] - - # Top up all new postcodes so they're all eight characters exactly - postcode_cols = ["601", "referencepostcode", "postcodes_harmonised"] - for col in postcode_cols: - updated_snapshot_df[col] = updated_snapshot_df[col].apply(postcode_topup) + # Run GB specific actions + if not is_northern_ireland: + # Long form records with a postcode in 601 use this as the postcode + long_form_cond = (~updated_snapshot_df["601"].isnull()) + updated_snapshot_df.loc[long_form_cond, "postcodes_harmonised"] = updated_snapshot_df["601"] + + # Short form records with nothing in 601 use referencepostcode instead + short_form_cond = (updated_snapshot_df["601"].isnull()) & (~updated_snapshot_df["referencepostcode"].isnull()) + updated_snapshot_df.loc[short_form_cond, "postcodes_harmonised"] = updated_snapshot_df["referencepostcode"] + + # Top up all new postcodes so they're all eight characters exactly + postcode_cols = ["601", "referencepostcode", "postcodes_harmonised"] + for col in postcode_cols: + updated_snapshot_df[col] = updated_snapshot_df[col].apply(postcode_topup) updated_snapshot_df = updated_snapshot_df.sort_values( ["reference", "instance"], ascending=[True, True] diff --git a/src/developer_config.yaml b/src/developer_config.yaml index 8c1ddee7f..dc4df2a4a 100644 --- a/src/developer_config.yaml +++ b/src/developer_config.yaml @@ -27,6 +27,7 @@ global: output_outlier_qa : False output_estimation_qa: False output_apportionment_qa: False + output_imputation_qa: False output_long_form: False output_short_form: False output_gb_sas: False @@ -36,7 +37,7 @@ global: output_intram_by_itl1: False output_intram_by_civil_defence: False output_intram_by_sic: False - output_status_filtered: True + output_status_filtered: False output_fte_total_qa: False years: current_year: 2022 # TODO: put this in the userconfig From faac8f5ccda56e797e0b7954f6de590c5aec7e91 Mon Sep 17 00:00:00 2001 From: Griffith Date: Mon, 15 Jan 2024 14:51:55 +0000 Subject: [PATCH 23/26] correct previous merge error in validation --- src/developer_config.yaml | 6 +-- src/staging/validation.py | 77 ++++++++++++++++++++++++++++++++------- 2 files changed, 67 insertions(+), 16 deletions(-) diff --git a/src/developer_config.yaml b/src/developer_config.yaml index 20ed94430..60aa0136b 100644 --- a/src/developer_config.yaml +++ b/src/developer_config.yaml @@ -12,7 +12,7 @@ global: load_ni_data: False load_historic_data: False run_construction: False - run_ni_construction: True + run_ni_construction: False load_manual_outliers: False load_manual_imputation: False load_backdata: True # whether to load previous year data for MoR @@ -26,8 +26,8 @@ global: output_imputation_qa: False output_auto_outliers: False output_outlier_qa : False - output_estimation_qa: False - output_apportionment_qa: False + output_estimation_qa: True + output_apportionment_qa: True output_long_form: False output_short_form: False output_gb_sas: False diff --git a/src/staging/validation.py b/src/staging/validation.py index 417a28116..ca0274209 100644 --- a/src/staging/validation.py +++ b/src/staging/validation.py @@ -334,6 +334,9 @@ 
def load_schema(file_path: str = "./config/contributors_schema.toml") -> dict: toml_dict = toml.load(file_path) else: # Return False if file does not exist + ValidationLogger.warning( + "Validation schema does not exist! Path may be incorrect" + ) return file_exists return toml_dict @@ -416,6 +419,9 @@ def validate_data_with_schema(survey_df: pd.DataFrame, schema_path: str): # Load schema from toml dtypes_schema = load_schema(schema_path) + if not dtypes_schema: + raise FileNotFoundError(f"File at {schema_path} does not exist. Check path") + # Create a dict for dtypes only dtypes_dict = { column_nm: dtypes_schema[column_nm]["Deduced_Data_Type"] @@ -442,6 +448,15 @@ def validate_data_with_schema(survey_df: pd.DataFrame, schema_path: str): survey_df[column] = survey_df[column].astype(pd.Int64Dtype()) elif dtypes_dict[column] == "str": survey_df[column] = survey_df[column].astype("string") + elif "datetime" in dtypes_dict[column]: + try: + survey_df[column] = pd.to_datetime( + survey_df[column], errors="coerce" + ) + except TypeError: + raise TypeError( + f"Failed to convert column '{column}' to datetime. Please check the data." + ) else: survey_df[column] = survey_df[column].astype(dtypes_dict[column]) ValidationLogger.debug(f"{column} after: {survey_df[column].dtype}") @@ -551,22 +566,31 @@ def check_ultfoc(value): @time_logger_wrap @exception_wrap -def validate_many_to_one( - mapper: pd.DataFrame, col_many: str, col_one: str -) -> pd.DataFrame: +def validate_many_to_one(*args) -> pd.DataFrame: """ + Validates a many-to-one mapper DataFrame. - Validates a many to one mapper: - 1. Checks if the mapper has two columns col_many and col_one. - 2. Salects and deduplicates col_many and col_one. - 3. Checks that for each entry in col_many there is exactly one entry in - col_one. + This function performs the following checks: + 1. Checks if the mapper has two specified columns, referred to as 'col_many' and 'col_one'. + 2. Selects and deduplicates 'col_many' and 'col_one'. + 3. Checks that for each entry in 'col_many' there is exactly one corresponding entry in 'col_one'. Args: - df (pd.DataFrame): The input mapper - col_many (str): name of the column with many entries - col_one (str): name of the column with one entry + *args: Variable length argument list. It should contain the following items in order: + - df (pd.DataFrame): The input mapper DataFrame. + - col_many (str): The name of the column with many entries. + - col_one (str): The name of the column with one entry. + + Returns: + pd.DataFrame: The validated mapper DataFrame with deduplicated 'col_many' and 'col_one' columns. + + Raises: + ValueError: If the mapper does not have the 'col_many' and 'col_one' columns, or if there are multiple entries in 'col_one' for any entry in 'col_many'. 
""" + + mapper = args[0] + col_many = args[1] + col_one = args[2] try: # Check that expected column are present cols = mapper.columns @@ -588,7 +612,7 @@ def validate_many_to_one( ValidationLogger.info( "The following codes have multile mapping: \n {df_bad}" ) - raise ValueError(f"Mapper is many to many") + raise ValueError("Mapper is many to many") return df except ValueError as ve: @@ -625,7 +649,7 @@ def validate_cora_df(df: pd.DataFrame) -> pd.DataFrame: df["contents_check"] = status_check & from_status_check # Check if there are any False values in the "contents_check" column - if (df["contents_check"] == False).any(): + if (df["contents_check"] == False).any(): # noqa raise ValueError("Unexpected format within column contents") # Drop the "contents_check" column @@ -635,3 +659,30 @@ def validate_cora_df(df: pd.DataFrame) -> pd.DataFrame: except ValueError as ve: raise ValueError("cora status mapper validation failed: " + str(ve)) + + +def flag_no_rand_spenders(df, raise_or_warn): + """ + Flags any records that answer "No" to "604" and also report their expenditure in "211" as more than 0. + + Parameters: + df (pandas.DataFrame): The input DataFrame. + + Returns: + None + """ + invalid_records = df.loc[(df["604"] == "No") & (df["211"] > 0)] + + if not invalid_records.empty: + if raise_or_warn == "raise": + raise Exception("Some records report no R&D, but spend in 211 > 0.") + elif raise_or_warn == "warn": + total_invalid_spend = invalid_records["211"].sum() + ValidationLogger.error("Some records report no R&D, but spend in 211 > 0.") + ValidationLogger.error( + f"The total spend of 'No' R&D companies is £{int(total_invalid_spend)}" + ) + ValidationLogger.error(invalid_records) + + else: + ValidationLogger.debug("All records have valid R&D spend.") From dd4092e63e64ee669f731f59e3053f1713daa567 Mon Sep 17 00:00:00 2001 From: Griffith Date: Mon, 15 Jan 2024 16:23:00 +0000 Subject: [PATCH 24/26] fill nulls in MoR and CF --- src/_version.py | 2 +- src/developer_config.yaml | 4 ++-- src/imputation/MoR.py | 12 +++++++----- src/imputation/imputation_main.py | 7 ++++--- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/_version.py b/src/_version.py index 43c4ab005..22049ab2c 100644 --- a/src/_version.py +++ b/src/_version.py @@ -1 +1 @@ -__version__ = "0.6.1" +__version__ = "0.6.2" diff --git a/src/developer_config.yaml b/src/developer_config.yaml index 2b4190ede..bc1052920 100644 --- a/src/developer_config.yaml +++ b/src/developer_config.yaml @@ -11,7 +11,7 @@ global: load_updated_snapshot: False # Whether to load the updated snapshots for amendments and additions load_ni_data: False load_historic_data: False - run_construction: False + run_construction: True run_ni_construction: False load_manual_outliers: False load_manual_imputation: False @@ -87,7 +87,7 @@ network_paths: backdata_path: "R:/BERD Results System Development 2023/2021_data/validation-extract-responses-202112.csv" outliers_path: "R:/BERD Results System Development 2023/DAP_emulation/outliers" manual_outliers_path: "R:/BERD Results System Development 2023/DAP_emulation/outliers/manual_outliers/manual_outlier_2023-08-29_v67.csv" - construction_file_path: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/JC_test_construction_file.csv" + construction_file_path: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/test_construction_file.csv" construction_file_path_ni: "R:/BERD Results System Development 
2023/DAP_emulation/construction/manual_construction/test_construction_ni_file.csv" # construction_add_path: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/construction_additions_2023-11-06_v5.csv" # TODO Need to test # construction_amend_path: "R:/BERD Results System Development 2023/DAP_emulation/construction/manual_construction/construction_amendments_2023-10-31_v2.csv" # TODO Need to test diff --git a/src/imputation/MoR.py b/src/imputation/MoR.py index 3878a46d4..e4d6ad82f 100644 --- a/src/imputation/MoR.py +++ b/src/imputation/MoR.py @@ -124,15 +124,17 @@ def carry_forwards(df, backdata, impute_vars): # Copy values from relevant columns where references match match_cond = df["_merge"] == "both" - # replace the values of certain columns with the values from the back data - # TODO: Check with methodology or BAU as to which other cols to take from backdata - # TODO: By default, columns not updated such as 4xx, 5xx will contain the current - # data, instance 0. + # Replace the values of certain columns with the values from the back data replace_vars = ["instance", "200", "201", "601", "602", "604"] for var in replace_vars: df.loc[match_cond, var] = df.loc[match_cond, f"{var}_prev"] + + # Update the varibles to be imputed by the corresponding previous values, filling + # nulls with zeros. for var in impute_vars: - df.loc[match_cond, f"{var}_imputed"] = df.loc[match_cond, f"{var}_prev"] + df.loc[match_cond, f"{var}_imputed"] = df.loc[ + match_cond, f"{var}_prev" + ].fillna(0) df.loc[match_cond, "imp_marker"] = "CF" df.loc[match_cond] = create_imp_class_col(df, "200_prev", "201_prev") diff --git a/src/imputation/imputation_main.py b/src/imputation/imputation_main.py index 64e9e6ca1..0dc271a2f 100644 --- a/src/imputation/imputation_main.py +++ b/src/imputation/imputation_main.py @@ -125,13 +125,14 @@ def run_imputation( # Changing all records that meet the criteria to "604" == "Yes" imputed_df.loc[(chk_mask & imputation_mask), "604"] = "Yes" - # Run short form expansion - imputed_df = run_sf_expansion(imputed_df, config) - # join constructed rows back to the imputed df + # Note that constructed rows need to be included in short form expansion if "is_constructed" in df.columns: imputed_df = pd.concat([imputed_df, constructed_df]) + # Run short form expansion + imputed_df = run_sf_expansion(imputed_df, config) + # join manually trimmed columns back to the imputed df if not trimmed_df.empty: imputed_df = pd.concat([imputed_df, trimmed_df]) From 8968c43e827c1187817c208c329f9b18fd42c047 Mon Sep 17 00:00:00 2001 From: Griffith Date: Mon, 15 Jan 2024 17:36:56 +0000 Subject: [PATCH 25/26] 654 bugfix in progress --- src/imputation/MoR.py | 3 +++ src/imputation/sf_expansion.py | 35 +++++++++++++++++++++++++++++----- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/src/imputation/MoR.py b/src/imputation/MoR.py index e4d6ad82f..8b5c3c9ef 100644 --- a/src/imputation/MoR.py +++ b/src/imputation/MoR.py @@ -10,6 +10,7 @@ calculate_totals, ) + good_statuses = ["Clear", "Clear - overridden"] bad_statuses = ["Form sent out", "Check needed"] @@ -112,6 +113,8 @@ def carry_forwards(df, backdata, impute_vars): df = pd.merge( df, backdata, how="left", on="reference", suffixes=("", "_prev"), indicator=True ) + # ensure the instance columns are still type "int" after merge + df = df.astype({"instance": "Int64", "instance_prev": "Int64"}) # keep only the rows needed, see function docstring for details. 
no_match_cond = df["_merge"] == "left_only" diff --git a/src/imputation/sf_expansion.py b/src/imputation/sf_expansion.py index 025fe43d2..82b534519 100644 --- a/src/imputation/sf_expansion.py +++ b/src/imputation/sf_expansion.py @@ -165,19 +165,44 @@ def apply_expansion( return expanded_df +def prepare_short_form_constructed(df: pd.DataFrame, master_cols: List) -> pd.DataFrame: + """Prepare the constructed short form responses for sf expansion. + + The constructed records were removed from imputation, so it is necessary to copy + the master columns to the empty "imputed" master columns. + It is also necessary to create imputation classes for these records. + + For example, column "211" needs to be copied to "211_imputed" in these cases. + """ + sf_constructed_mask = (df.formtype == "0006") & (df.imp_marker == "constructed") + + # Create imputation class for the short + df.loc[sf_constructed_mask, "imp_class"] = ( + df.loc[sf_constructed_mask, "200"] + df.loc[sf_constructed_mask, "201"] + ) + + # Copy the values of the master columns to the corresponding "_imputed" column + for col in master_cols: + df.loc[sf_constructed_mask, f"{col}_imputed"] = df.loc[sf_constructed_mask, col] + + return df + + @df_change_func_wrap def run_sf_expansion(df: pd.DataFrame, config: dict) -> pd.DataFrame: """Calculate the expansion imputated values for short forms using long form data.""" - - # Remove records that have the reference list variables - # and those that have "nan" in the imp class - filtered_df, excluded_df = split_df_on_imp_class(df) - # Get dictionary of short form master keys (or target variables) # and breakdown variables breakdown_dict = config["breakdowns"] master_values = list(breakdown_dict) + # Prepare constructed short-form entries for sf expansion imputation + df = prepare_short_form_constructed(df, master_values) + + # Remove records that have the reference list variables + # and those that have "nan" in the imp class + filtered_df, excluded_df = split_df_on_imp_class(df) + # Obtain the "threshold_num" from the config # (this is the minimum viable number in an imputation class) threshold_num = config["imputation"]["sf_expansion_threshold"] From e5b0d9ec14647966d3d13365a0de20cc7efea1c4 Mon Sep 17 00:00:00 2001 From: Griffith Date: Tue, 16 Jan 2024 08:54:00 +0000 Subject: [PATCH 26/26] 654 add underscore to imputation class creation --- src/imputation/sf_expansion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imputation/sf_expansion.py b/src/imputation/sf_expansion.py index 82b534519..bfbe9efde 100644 --- a/src/imputation/sf_expansion.py +++ b/src/imputation/sf_expansion.py @@ -178,7 +178,7 @@ def prepare_short_form_constructed(df: pd.DataFrame, master_cols: List) -> pd.Da # Create imputation class for the short df.loc[sf_constructed_mask, "imp_class"] = ( - df.loc[sf_constructed_mask, "200"] + df.loc[sf_constructed_mask, "201"] + df.loc[sf_constructed_mask, "200"] + "_" + df.loc[sf_constructed_mask, "201"] ) # Copy the values of the master columns to the corresponding "_imputed" column
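Taken together, the later patches lend themselves to a few standalone sketches. First, the many-to-one mapper check reworked in patch 23: the sketch below is an illustrative re-implementation on a toy mapper, not the validation.py code, which also logs the offending codes before raising.

import pandas as pd


def check_many_to_one(mapper: pd.DataFrame, col_many: str, col_one: str) -> pd.DataFrame:
    # deduplicate the two mapper columns, then require exactly one target value per source value
    deduped = mapper[[col_many, col_one]].drop_duplicates()
    counts = deduped.groupby(col_many)[col_one].nunique()
    if (counts > 1).any():
        bad_codes = counts[counts > 1].index.tolist()
        raise ValueError(f"Mapper is many to many for codes: {bad_codes}")
    return deduped


# a well-formed numeric-to-alpha product group mapper passes the check
pg_num_alpha = pd.DataFrame({"pg_numeric": [1, 2, 2], "pg_alpha": ["A", "B", "B"]})
print(check_many_to_one(pg_num_alpha, "pg_numeric", "pg_alpha"))

# a mapper sending code 1 to both "A" and "B" fails it
bad_mapper = pd.DataFrame({"pg_numeric": [1, 1], "pg_alpha": ["A", "B"]})
try:
    check_many_to_one(bad_mapper, "pg_numeric", "pg_alpha")
except ValueError as err:
    print(err)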
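Next, the null handling added to carry forwards in patches 24 and 25. The toy frames and the simplified merge below are assumptions for illustration only; the pipeline logic lives in carry_forwards in src/imputation/MoR.py.

import pandas as pd

current = pd.DataFrame({"reference": [1, 2], "instance": [0, 0], "211": [None, None]})
backdata = pd.DataFrame({"reference": [1, 2], "instance": [1, None], "211": [100.0, None]})

df = pd.merge(
    current, backdata, how="left", on="reference", suffixes=("", "_prev"), indicator=True
)

# the merge can upcast the instance columns, so cast back to nullable Int64 (patch 25)
df = df.astype({"instance": "Int64", "instance_prev": "Int64"})

match_cond = df["_merge"] == "both"

# carry previous values into the imputed column, filling nulls with zero (patch 24)
df.loc[match_cond, "211_imputed"] = df.loc[match_cond, "211_prev"].fillna(0)
df.loc[match_cond, "imp_marker"] = "CF"

print(df[["reference", "instance", "211_imputed", "imp_marker"]])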
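Finally, the short-form constructed preparation added in patch 25, with the underscore separator from patch 26. Column names follow the patches; the toy frame is illustrative only, and the real function is prepare_short_form_constructed in src/imputation/sf_expansion.py.

import pandas as pd

master_cols = ["211"]
df = pd.DataFrame(
    {
        "formtype": ["0006", "0006", "0001"],
        "imp_marker": ["constructed", "TMI", "R"],
        "200": ["C", "D", "C"],
        "201": ["AA", "AB", "AA"],
        "211": [50.0, 60.0, 70.0],
        "211_imputed": [None, 60.0, None],
    }
)

sf_constructed_mask = (df.formtype == "0006") & (df.imp_marker == "constructed")

# build the imputation class with an underscore separator, e.g. "C" + "AA" -> "C_AA" (patch 26)
df.loc[sf_constructed_mask, "imp_class"] = (
    df.loc[sf_constructed_mask, "200"] + "_" + df.loc[sf_constructed_mask, "201"]
)

# constructed rows skipped imputation, so copy the master values into the imputed columns
for col in master_cols:
    df.loc[sf_constructed_mask, f"{col}_imputed"] = df.loc[sf_constructed_mask, col]

print(df[["formtype", "imp_marker", "imp_class", "211_imputed"]])

The underscore presumably keeps the class label unambiguous and consistent with the imputation classes built elsewhere in the pipeline.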