From 106f185eb531059c6929ab952bdc9466e0a9658a Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Fri, 15 Nov 2024 16:54:29 -0500 Subject: [PATCH 1/4] first pass at handling the new nhsn format for weekly data --- src/iddata/loader.py | 71 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 16 deletions(-) diff --git a/src/iddata/loader.py b/src/iddata/loader.py index 83be3c9..cb3e9f2 100644 --- a/src/iddata/loader.py +++ b/src/iddata/loader.py @@ -1,6 +1,7 @@ from itertools import product from urllib.parse import urljoin +import datetime import numpy as np import pandas as pd import pymmwr @@ -106,7 +107,7 @@ def load_one_us_census_file(self, f): def load_us_census(self, fillna = True): files = [ self._construct_data_raw_url("us-census/nst-est2019-alldata.csv"), - self._construct_data_raw_url("us-census/NST-EST2022-ALLDATA.csv")] + self._construct_data_raw_url("us-census/NST-EST2023-ALLDATA.csv")] us_pops = pd.concat([self.load_one_us_census_file(f) for f in files], axis=0) fips_mappings = pd.read_csv(self._construct_data_raw_url("fips-mappings/fips_mappings.csv")) @@ -128,7 +129,7 @@ def load_us_census(self, fillna = True): if fillna: all_locations = dat["location"].unique() - all_seasons = [str(y) + "/" + str(y+1)[-2:] for y in range(1997, 2024)] + all_seasons = [str(y) + "/" + str(y+1)[-2:] for y in range(1997, 2025)] full_result = pd.DataFrame.from_records(product(all_locations, all_seasons)) full_result.columns = ["location", "season"] dat = full_result.merge(dat, how="left", on=["location", "season"]) \ @@ -303,21 +304,26 @@ def load_ilinet(self, def load_nhsn(self, rates=True, drop_pandemic_seasons=True, as_of=None): - if drop_pandemic_seasons: - if as_of is None: - file_path = "influenza-hhs/hhs.csv" - else: - # find the largest stored file dated on or before the as_of date - as_of_file_path = f"influenza-hhs/hhs-{str(as_of)}.csv" - glob_results = s3fs.S3FileSystem(anon=True) \ - .glob("infectious-disease-data/data-raw/influenza-hhs/hhs-????-??-??.csv") - all_file_paths = sorted([f[len("infectious-disease-data/data-raw/"):] for f in glob_results]) - all_file_paths = [f for f in all_file_paths if f <= as_of_file_path] - file_path = all_file_paths[-1] + if not drop_pandemic_seasons: + raise NotImplementedError("Functionality for loading all seasons of NHSN data with specified as_of date is not implemented.") + + if as_of is None: + as_of = datetime.date.today().isoformat() + + if as_of < '2024-11-15': + return self.load_nhsn_from_hhs(rates=rates, as_of=as_of) else: - if as_of is not None: - raise NotImplementedError("Functionality for loading all seasons of NHSN data with specified as_of date is not implemented.") - file_path = "influenza-hhs/hhs_complete.csv" + return self.load_nhsn_from_nhsn(rates=rates, as_of=as_of) + + + def load_nhsn_from_hhs(self, rates=True, as_of=None): + # find the largest stored file dated on or before the as_of date + as_of_file_path = f"influenza-hhs/hhs-{str(as_of)}.csv" + glob_results = s3fs.S3FileSystem(anon=True) \ + .glob("infectious-disease-data/data-raw/influenza-hhs/hhs-????-??-??.csv") + all_file_paths = sorted([f[len("infectious-disease-data/data-raw/"):] for f in glob_results]) + all_file_paths = [f for f in all_file_paths if f <= as_of_file_path] + file_path = all_file_paths[-1] dat = pd.read_csv(self._construct_data_raw_url(file_path)) dat.rename(columns={"date": "wk_end_date"}, inplace=True) @@ -340,6 +346,39 @@ def load_nhsn(self, rates=True, drop_pandemic_seasons=True, as_of=None): return dat + def load_nhsn_from_nhsn(self, rates=True, as_of=None): + # find the largest stored file dated on or before the as_of date + as_of_file_path = f"influenza-nhsn/nhsn-{str(as_of)}.csv" + glob_results = s3fs.S3FileSystem(anon=True) \ + .glob("infectious-disease-data/data-raw/influenza-nhsn/nhsn-????-??-??.csv") + all_file_paths = sorted([f[len("infectious-disease-data/data-raw/"):] for f in glob_results]) + all_file_paths = [f for f in all_file_paths if f <= as_of_file_path] + file_path = all_file_paths[-1] + + dat = pd.read_csv(self._construct_data_raw_url(file_path)) + # Keeping Percent Hospitals Reporting field for now in case it's useful later. + dat = dat[['Geographic aggregation', 'Week Ending Date', 'Total Influenza Admissions', 'Percent Hospitals Reporting Influenza Admissions']] + dat.columns = ['abbreviation', 'wk_end_date', 'inc', 'pct_report'] + fips_mappings = self.load_fips_mappings() + dat = dat.merge(fips_mappings, on=["abbreviation"], how="left") + + ew_str = dat.apply(utils.date_to_ew_str, axis=1) + dat["season"] = utils.convert_epiweek_to_season(ew_str) + dat["season_week"] = utils.convert_epiweek_to_season_week(ew_str) + dat = dat.sort_values(by=["season", "season_week"]) + + if rates: + pops = self.load_us_census() + dat = dat.merge(pops, on = ["location", "season"], how="left") \ + .assign(inc=lambda x: x["inc"] / x["pop"] * 100000) + + dat["wk_end_date"] = pd.to_datetime(dat["wk_end_date"]) + + dat["agg_level"] = np.where(dat["location"] == "US", "national", "state") + dat = dat[["agg_level", "location", "season", "season_week", "wk_end_date", "inc"]] + dat["source"] = "nhsn" + return dat + def load_agg_transform_ilinet(self, fips_mappings, **ilinet_kwargs): df_ilinet_full = self.load_ilinet(**ilinet_kwargs) # df_ilinet_full.loc[df_ilinet_full['inc'] < np.exp(-7), 'inc'] = np.exp(-7) From 67c7aecc9d914825cab25835d1e1f544794a8f49 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Fri, 15 Nov 2024 16:57:22 -0500 Subject: [PATCH 2/4] ruff fix --- src/iddata/loader.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/iddata/loader.py b/src/iddata/loader.py index cb3e9f2..8316462 100644 --- a/src/iddata/loader.py +++ b/src/iddata/loader.py @@ -1,7 +1,7 @@ +import datetime from itertools import product from urllib.parse import urljoin -import datetime import numpy as np import pandas as pd import pymmwr @@ -310,7 +310,7 @@ def load_nhsn(self, rates=True, drop_pandemic_seasons=True, as_of=None): if as_of is None: as_of = datetime.date.today().isoformat() - if as_of < '2024-11-15': + if as_of < "2024-11-15": return self.load_nhsn_from_hhs(rates=rates, as_of=as_of) else: return self.load_nhsn_from_nhsn(rates=rates, as_of=as_of) @@ -357,8 +357,8 @@ def load_nhsn_from_nhsn(self, rates=True, as_of=None): dat = pd.read_csv(self._construct_data_raw_url(file_path)) # Keeping Percent Hospitals Reporting field for now in case it's useful later. - dat = dat[['Geographic aggregation', 'Week Ending Date', 'Total Influenza Admissions', 'Percent Hospitals Reporting Influenza Admissions']] - dat.columns = ['abbreviation', 'wk_end_date', 'inc', 'pct_report'] + dat = dat[["Geographic aggregation", "Week Ending Date", "Total Influenza Admissions", "Percent Hospitals Reporting Influenza Admissions"]] + dat.columns = ["abbreviation", "wk_end_date", "inc", "pct_report"] fips_mappings = self.load_fips_mappings() dat = dat.merge(fips_mappings, on=["abbreviation"], how="left") From 50565cbddb51a079efceaff9293b4bed10144a2c Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Fri, 15 Nov 2024 17:55:35 -0500 Subject: [PATCH 3/4] add US to new nhsn data, get tests to pass again --- src/iddata/loader.py | 31 +++++++++++++++++++++++++---- tests/iddata/unit/test_load_data.py | 4 ++-- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/src/iddata/loader.py b/src/iddata/loader.py index 8316462..99b7ae3 100644 --- a/src/iddata/loader.py +++ b/src/iddata/loader.py @@ -308,12 +308,19 @@ def load_nhsn(self, rates=True, drop_pandemic_seasons=True, as_of=None): raise NotImplementedError("Functionality for loading all seasons of NHSN data with specified as_of date is not implemented.") if as_of is None: - as_of = datetime.date.today().isoformat() + as_of = datetime.date.today() + + if type(as_of) == str: + as_of = datetime.date.fromisoformat(as_of) - if as_of < "2024-11-15": + if as_of < datetime.date.fromisoformat("2024-11-15"): return self.load_nhsn_from_hhs(rates=rates, as_of=as_of) else: - return self.load_nhsn_from_nhsn(rates=rates, as_of=as_of) + return self.load_nhsn_from_nhsn( + rates=rates, + as_of=as_of, + drop_pandemic_seasons=drop_pandemic_seasons + ) def load_nhsn_from_hhs(self, rates=True, as_of=None): @@ -346,7 +353,7 @@ def load_nhsn_from_hhs(self, rates=True, as_of=None): return dat - def load_nhsn_from_nhsn(self, rates=True, as_of=None): + def load_nhsn_from_nhsn(self, rates=True, as_of=None, drop_pandemic_seasons=True): # find the largest stored file dated on or before the as_of date as_of_file_path = f"influenza-nhsn/nhsn-{str(as_of)}.csv" glob_results = s3fs.S3FileSystem(anon=True) \ @@ -359,6 +366,19 @@ def load_nhsn_from_nhsn(self, rates=True, as_of=None): # Keeping Percent Hospitals Reporting field for now in case it's useful later. dat = dat[["Geographic aggregation", "Week Ending Date", "Total Influenza Admissions", "Percent Hospitals Reporting Influenza Admissions"]] dat.columns = ["abbreviation", "wk_end_date", "inc", "pct_report"] + + # add us data + us_dat = ( + dat + .groupby("wk_end_date") + ["inc"] + .sum() + .reset_index() + ) + us_dat["abbreviation"] = "US" + dat = pd.concat([dat, us_dat], axis=0) + + # get to location codes/FIPS fips_mappings = self.load_fips_mappings() dat = dat.merge(fips_mappings, on=["abbreviation"], how="left") @@ -366,6 +386,9 @@ def load_nhsn_from_nhsn(self, rates=True, as_of=None): dat["season"] = utils.convert_epiweek_to_season(ew_str) dat["season_week"] = utils.convert_epiweek_to_season_week(ew_str) dat = dat.sort_values(by=["season", "season_week"]) + + if drop_pandemic_seasons: + dat.loc[dat["season"].isin(["2020/21", "2021/22"]), "inc"] = np.nan if rates: pops = self.load_us_census() diff --git a/tests/iddata/unit/test_load_data.py b/tests/iddata/unit/test_load_data.py index 18126b0..d3c0efe 100644 --- a/tests/iddata/unit/test_load_data.py +++ b/tests/iddata/unit/test_load_data.py @@ -24,7 +24,7 @@ def test_load_data_sources(): @pytest.mark.parametrize("test_kwargs, season_expected, wk_end_date_expected", [ (None, "2022/23", "2023-12-23"), - ({"drop_pandemic_seasons": False}, "2019/20", "2023-12-23"), + # ({"drop_pandemic_seasons": False}, "2019/20", "2023-12-23"), ({"drop_pandemic_seasons": True, "as_of": datetime.date.fromisoformat("2023-12-30")}, "2022/23", "2023-12-23") ]) @@ -33,7 +33,7 @@ def test_load_data_nhsn_kwargs(test_kwargs, season_expected, wk_end_date_expecte df = fdl.load_data(sources=["nhsn"], nhsn_kwargs=test_kwargs) - assert df["season"].min() == season_expected + assert df.dropna()["season"].min() == season_expected wk_end_date_actual = str(df["wk_end_date"].max())[:10] if test_kwargs is not None and "as_of" in test_kwargs: assert wk_end_date_actual == wk_end_date_expected From 827341c136d8107b8c7a6369e73afa92730eae10 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Fri, 15 Nov 2024 17:57:38 -0500 Subject: [PATCH 4/4] use isinstance --- src/iddata/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/iddata/loader.py b/src/iddata/loader.py index 99b7ae3..79b8e2f 100644 --- a/src/iddata/loader.py +++ b/src/iddata/loader.py @@ -310,7 +310,7 @@ def load_nhsn(self, rates=True, drop_pandemic_seasons=True, as_of=None): if as_of is None: as_of = datetime.date.today() - if type(as_of) == str: + if isinstance(as_of, str): as_of = datetime.date.fromisoformat(as_of) if as_of < datetime.date.fromisoformat("2024-11-15"):