From e6c84d8d9944341dcca73c60cd36a4b3d3844cda Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Tue, 13 Apr 2021 14:47:20 -0700 Subject: [PATCH] NANS for HHS: * add missing columns --- hhs_hosp/delphi_hhs/run.py | 28 +++++++++++++++++-------- hhs_hosp/tests/test_run.py | 43 +++++++++++++++++++++++++------------- 2 files changed, 48 insertions(+), 23 deletions(-) diff --git a/hhs_hosp/delphi_hhs/run.py b/hhs_hosp/delphi_hhs/run.py index 590b9ffa91..db4b91b7ce 100644 --- a/hhs_hosp/delphi_hhs/run.py +++ b/hhs_hosp/delphi_hhs/run.py @@ -8,9 +8,7 @@ import time from delphi_epidata import Epidata -from delphi_utils.export import create_export_csv -from delphi_utils.geomap import GeoMapper -from delphi_utils import get_structured_logger +from delphi_utils import create_export_csv, get_structured_logger, Nans, GeoMapper import numpy as np import pandas as pd @@ -63,6 +61,17 @@ def generate_date_ranges(start, end): output.append(Epidata.range(_date_to_int(start), _date_to_int(end))) return output +def add_nancodes(df): + """Add nancodes to a signal dataframe.""" + # Default missingness codes + df["missing_val"] = Nans.NOT_MISSING + df["missing_se"] = Nans.NOT_APPLICABLE + df["missing_sample_size"] = Nans.NOT_APPLICABLE + + # Mark any remaining nans with unknown + remaining_nans_mask = df["val"].isnull() + df.loc[remaining_nans_mask, "missing_val"] = Nans.UNKNOWN + return df def run_module(params): """ @@ -99,12 +108,15 @@ def run_module(params): geo_mapper = GeoMapper() for sig in SIGNALS: - state = geo_mapper.add_geocode(make_signal(all_columns, sig), - "state_id", "state_code", - from_col="state") + state = make_signal(all_columns, sig) + state = geo_mapper.add_geocode(state, "state_id", "state_code", from_col="state") for geo in GEOS: + df = make_geo(state, geo, geo_mapper) + df["se"] = np.nan + df["sample_size"] = np.nan + df = add_nancodes(df) create_export_csv( - make_geo(state, geo, geo_mapper), + df, params["common"]["export_dir"], geo, sig @@ -123,8 +135,6 @@ def make_geo(state, geo, geo_mapper): state, "state_code", geo, new_col="geo_id", date_col="timestamp") - exported["se"] = np.nan - exported["sample_size"] = np.nan return exported def make_signal(all_columns, sig): diff --git a/hhs_hosp/tests/test_run.py b/hhs_hosp/tests/test_run.py index bf5478bf2d..c3dfc47b4b 100644 --- a/hhs_hosp/tests/test_run.py +++ b/hhs_hosp/tests/test_run.py @@ -1,10 +1,10 @@ from datetime import datetime, date from unittest.mock import patch -from delphi_hhs.run import _date_to_int, int_date_to_previous_day_datetime, generate_date_ranges, \ +from delphi_hhs.run import _date_to_int, add_nancodes, int_date_to_previous_day_datetime, generate_date_ranges, \ make_signal, make_geo, run_module from delphi_hhs.constants import CONFIRMED, SUM_CONF_SUSP -from delphi_utils.geomap import GeoMapper +from delphi_utils import GeoMapper, Nans from freezegun import freeze_time import numpy as np import pandas as pd @@ -72,7 +72,7 @@ def test_make_geo(): """Check that geographies transform correctly.""" test_timestamp = datetime(year=2020, month=1, day=1) geo_mapper = GeoMapper() - + data = pd.DataFrame({ 'state': ['PA','WV','OH'], 'state_code': [42, 54, 39], @@ -80,30 +80,23 @@ def test_make_geo(): 'val': [1, 2, 4], }) - template = { - 'se': np.nan, - 'sample_size': np.nan, - } expecteds = { "state": pd.DataFrame( - dict(template, - geo_id=data.state, + dict(geo_id=data.state, timestamp=data.timestamp, val=data.val)), "hhs": pd.DataFrame( - dict(template, - geo_id=['3', '5'], + dict(geo_id=['3', '5'], timestamp=[test_timestamp]*2, val=[3, 4])), "nation": pd.DataFrame( - dict(template, - geo_id=['us'], + dict(geo_id=['us'], timestamp=[test_timestamp], val=[7])) } for geo, expected in expecteds.items(): result = make_geo(data, geo, geo_mapper) - for series in ["geo_id", "timestamp", "val", "se", "sample_size"]: + for series in ["geo_id", "timestamp", "val"]: pd.testing.assert_series_equal(expected[series], result[series], obj=f"{geo}:{series}") @@ -131,3 +124,25 @@ def test_ignore_last_range_no_results(mock_covid_hosp, mock_export): } } assert not run_module(params) # function should not raise value error and has no return value + +def test_add_nancode(): + data = pd.DataFrame({ + 'state': ['PA','WV','OH'], + 'state_code': [42, 54, 39], + 'timestamp': [pd.to_datetime("20200601")]*3, + 'val': [1, 2, np.nan], + 'se': [np.nan] * 3, + 'sample_size': [np.nan] * 3, + }) + expected = pd.DataFrame({ + 'state': ['PA','WV','OH'], + 'state_code': [42, 54, 39], + 'timestamp': [pd.to_datetime("20200601")]*3, + 'val': [1, 2, np.nan], + 'se': [np.nan] * 3, + 'sample_size': [np.nan] * 3, + 'missing_val': [Nans.NOT_MISSING] * 2 + [Nans.UNKNOWN], + 'missing_se': [Nans.NOT_APPLICABLE] * 3, + 'missing_sample_size': [Nans.NOT_APPLICABLE] * 3, + }) + pd.testing.assert_frame_equal(expected, add_nancodes(data))