Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix non-matching postcodes and missing short forms in construction #181

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions src/construction/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import Callable

from src.staging.validation import validate_data_with_schema
from src.staging.staging_helpers import postcode_topup
from src.outputs.outputs_helpers import create_period_year

construction_logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -87,6 +88,10 @@ def run_construction(
updated_snapshot_df = create_period_year(updated_snapshot_df)
construction_df = create_period_year(construction_df)

# Set instance=1 so longforms with status 'Form sent out' match correctly
form_sent_condition = (updated_snapshot_df.formtype == "0001") & (updated_snapshot_df.status == "Form sent out")
updated_snapshot_df.loc[form_sent_condition, "instance"] = 1

# NI data has no instance but needs an instance of 1
if is_northern_ireland:
construction_df["instance"] = 1
Expand Down Expand Up @@ -115,6 +120,19 @@ def run_construction(
{"reference": "Int64", "instance": "Int64", "period_year": "Int64"}
)

# Long form records with a postcode in 601 use this as the postcode
long_form_cond = (~updated_snapshot_df["601"].isnull())
updated_snapshot_df.loc[long_form_cond, "postcodes_harmonised"] = updated_snapshot_df["601"]

# Short form records with nothing in 601 use referencepostcode instead
short_form_cond = (updated_snapshot_df["601"].isnull()) & (~updated_snapshot_df["referencepostcode"].isnull())
updated_snapshot_df.loc[short_form_cond, "postcodes_harmonised"] = updated_snapshot_df["referencepostcode"]

# Top up all new postcodes to exactly eight characters (empty input stays an empty string)
postcode_cols = ["601", "referencepostcode", "postcodes_harmonised"]
for col in postcode_cols:
updated_snapshot_df[col] = updated_snapshot_df[col].apply(postcode_topup)

construction_logger.info(f"Construction edited {construction_df.shape[0]} rows.")

return updated_snapshot_df
4 changes: 2 additions & 2 deletions src/developer_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ global:
# Staging and validation settings
postcode_csv_check: False
load_updated_snapshot: False # Whether to load the updated snapshots for amendments and additions
load_ni_data: True
load_ni_data: False
load_historic_data: False
run_construction: False
run_ni_construction: True
Expand Down Expand Up @@ -37,7 +37,7 @@ global:
output_intram_by_itl1: False
output_intram_by_civil_defence: False
output_intram_by_sic: False
output_status_filtered: False
output_status_filtered: True
output_fte_total_qa: False
years:
current_year: 2022 # TODO: put this in the userconfig
Expand Down
5 changes: 2 additions & 3 deletions src/staging/staging_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,7 @@ def postcode_topup(mystr: str, target_len: int = 8) -> str:
spaces and cuts the tail on the right.
If there is only one part, keeps the first 8 characters and tops it up with
spaces on the right if needed.
Empty input string would have zero parts and will return a string of
eight spaces.
Empty input string would have zero parts and will return an empty string.

Args:
mystr (str): Input postcode.
Expand Down Expand Up @@ -69,7 +68,7 @@ def postcode_topup(mystr: str, target_len: int = 8) -> str:
return (part1 + part2)[:target_len]

else:
return mystr[:target_len].ljust(target_len, " ")
return ""


def fix_anon_data(responses_df, config):
Expand Down