Nancodes for claims_hosp:
* add missing columns, allow nan values
dshemetov committed Nov 9, 2021
1 parent 0b7103a commit e804677
Showing 4 changed files with 45 additions and 15 deletions.
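
The commit adds three columns (missing_val, missing_se, missing_sample_size) to every exported CSV, using the Nans codes from delphi_utils to record why a value is NA instead of leaving it implicit. A minimal sketch of one emitted row under the default path (standard error withheld for privacy), with a made-up geo id and value:

from delphi_utils import Nans  # same enum the indicator imports below

# One row as the updated write_to_csv emits it; geo_id and val are invented.
row = {
    "geo_id": "pa", "val": 1.234567, "se": "NA", "direction": "NA", "sample_size": "NA",
    "missing_val": Nans.NOT_MISSING.value,             # val is reported
    "missing_se": Nans.CENSORED.value,                 # se withheld for privacy
    "missing_sample_size": Nans.NOT_APPLICABLE.value,  # sample size is never reported here
}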
1 change: 0 additions & 1 deletion _delphi_utils_python/tests/test_export.py
@@ -9,7 +9,6 @@

from delphi_utils import create_export_csv, Nans


def _clean_directory(directory):
"""Clean files out of a directory."""
for fname in listdir(directory):
42 changes: 36 additions & 6 deletions claims_hosp/delphi_claims_hosp/update_indicator.py
@@ -13,7 +13,7 @@
# third party
import numpy as np
import pandas as pd
from delphi_utils import GeoMapper
from delphi_utils import GeoMapper, Nans

# first party
from delphi_utils import Weekday
@@ -235,7 +235,7 @@ def write_to_csv(self, output_dict, output_path="./receiving"):
all_rates = output_dict["rates"]
all_se = output_dict["se"]
all_include = output_dict["include"]
out_n = 0
out_n, out_i = 0, 0
for i, date in enumerate(dates):
filename = "%s/%s_%s_%s.csv" % (
output_path,
@@ -244,7 +244,10 @@ def write_to_csv(self, output_dict, output_path="./receiving"):
self.signal_name,
)
with open(filename, "w") as outfile:
outfile.write("geo_id,val,se,direction,sample_size\n")
outfile.write(
"geo_id,val,se,direction,sample_size," +
"missing_val,missing_se,missing_sample_size\n"
)
for geo_id in geo_ids:
val = all_rates[geo_id][i]
se = all_se[geo_id][i]
@@ -257,11 +260,38 @@ def write_to_csv(self, output_dict, output_path="./receiving"):
if self.write_se:
assert val > 0 and se > 0, "p=0, std_err=0 invalid"
outfile.write(
"%s,%f,%s,%s,%s\n" % (geo_id, val, se, "NA", "NA"))
"%s,%f,%s,%s,%s,%d,%d,%d\n" % (
geo_id, val, se, "NA", "NA",
Nans.NOT_MISSING.value,
Nans.NOT_MISSING.value,
Nans.NOT_APPLICABLE.value
)
)
else:
# for privacy reasons we will not report the standard error
outfile.write(
"%s,%f,%s,%s,%s\n" % (geo_id, val, "NA", "NA", "NA"))
"%s,%f,%s,%s,%s,%d,%d,%d\n" % (
geo_id, val, "NA", "NA", "NA",
Nans.NOT_MISSING.value,
Nans.CENSORED.value,
Nans.NOT_APPLICABLE.value
)
)
out_n += 1
else:
# Write nans out anyway for versioning
logging.warning("writing insufficient data for geo_id {0}, {1}".format(
geo_id, i
))
outfile.write(
"%s,%s,%s,%s,%s,%d,%d,%d\n" % (
geo_id, "NA", "NA", "NA", "NA",
Nans.CENSORED.value,
Nans.CENSORED.value,
Nans.NOT_APPLICABLE.value
)
)
out_i += 1

logging.debug("wrote %d rows for %d %s", out_n, len(geo_ids), geo_level)
logging.debug("wrote %d valued csvs for %d %s", out_n, len(geo_ids), geo_level)
logging.debug("wrote %d nan-valued csvs for %d %s", out_i, len(geo_ids), geo_level)
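
For downstream readers, the rows written only for versioning carry NA in val together with a CENSORED code in missing_val, so they can be filtered out cleanly. A minimal sketch of reading one exported file back, assuming a hypothetical path (the date, geography, and signal name in the filename are made up):

import pandas as pd
from delphi_utils import Nans

# Hypothetical exported file; actual names follow "%Y%m%d_<geo_level>_<signal_name>.csv".
df = pd.read_csv("receiving/20200502_state_claims_hosp.csv")

# Keep only rows whose value was actually reported; censored rows exist purely
# for versioning and have missing_val == Nans.CENSORED.value.
reported = df[df["missing_val"] == Nans.NOT_MISSING.value]
assert not reported["val"].isna().any()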
3 changes: 1 addition & 2 deletions claims_hosp/tests/test_indicator.py
@@ -57,13 +57,12 @@ def test_fit_fips(self):
date_range = pd.date_range("2020-05-01", "2020-05-20")
all_fips = self.fips_data.fips.unique()
loc_index_fips_data = self.fips_data.set_index(["fips", "timestamp"])
sample_fips = nr.choice(all_fips, 10)
sample_fips = all_fips[:50]

for fips in sample_fips:
sub_data = loc_index_fips_data.loc[fips]
sub_data = sub_data.reindex(date_range, fill_value=0)
res0 = ClaimsHospIndicator.fit(sub_data, date_range[0], fips)
# first value is burn-in
assert np.min(res0["rate"][1:]) > 0
assert np.max(res0["rate"][1:]) <= 100

14 changes: 8 additions & 6 deletions claims_hosp/tests/test_update_indicator.py
@@ -144,8 +144,9 @@ def test_write_to_csv_results(self):
expected_name = f"20200502_geography_{Config.signal_name}.csv"
assert exists(join(td.name, expected_name))
output_data = pd.read_csv(join(td.name, expected_name))
expected_columns = ["geo_id", "val", "se", "direction", "sample_size", "missing_val", "missing_se", "missing_sample_size"]
assert (
output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"]
output_data.columns == expected_columns
).all()
assert (output_data.geo_id == ["a", "b"]).all()
assert np.array_equal(output_data.val.values, np.array([0.1, 1]))
@@ -159,10 +160,10 @@ def test_write_to_csv_results(self):
assert exists(join(td.name, expected_name))
output_data = pd.read_csv(join(td.name, expected_name))
assert (
output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"]
output_data.columns == expected_columns
).all()
assert (output_data.geo_id == ["a"]).all()
assert np.array_equal(output_data.val.values, np.array([0.5]))
assert (output_data.geo_id == ["a", "b"]).all()
assert np.array_equal(output_data.val.values, np.array([0.5, np.nan]), equal_nan=True)
assert np.isnan(output_data.se.values).all()
assert np.isnan(output_data.direction.values).all()
assert np.isnan(output_data.sample_size.values).all()
@@ -171,7 +172,7 @@ def test_write_to_csv_results(self):
assert exists(join(td.name, expected_name))
output_data = pd.read_csv(join(td.name, expected_name))
assert (
output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"]
output_data.columns == expected_columns
).all()
assert (output_data.geo_id == ["a", "b"]).all()
assert np.array_equal(output_data.val.values, np.array([1.5, 3]))
@@ -224,8 +225,9 @@ def test_write_to_csv_with_se_results(self):
expected_name = f"20200502_geography_{signal_name}.csv"
assert exists(join(td.name, expected_name))
output_data = pd.read_csv(join(td.name, expected_name))
expected_columns = ["geo_id", "val", "se", "direction", "sample_size", "missing_val", "missing_se", "missing_sample_size"]
assert (
output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"]
output_data.columns == expected_columns
).all()
assert (output_data.geo_id == ["a", "b"]).all()
assert np.array_equal(output_data.val.values, np.array([0.1, 1]))