Skip to content

Commit

Permalink
#38: adding compression options
Browse files Browse the repository at this point in the history
  • Loading branch information
rjhd2 committed Apr 13, 2022
1 parent d6bd6b5 commit 67310bd
Showing 1 changed file with 19 additions and 12 deletions.
31 changes: 19 additions & 12 deletions PYTHON_CDM_Conversion_code/monthly_to_cdm_all_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,10 @@


# Set the file extensions for the monthly obs input (csv) and output (psv) files, and the compression suffix
EXTENSION = 'csv'
IN_EXTENSION = ".csv"
OUT_EXTENSION = ".psv"
COMPRESSION = ".gz"

LITE_COLS = ["STATION", "LATITUDE", "LONGITUDE", "ELEVATION", "DATE", "NAME", "PRCP", "TMIN", "TMAX", "TAVG", "SNOW", "AWND"]


Expand Down Expand Up @@ -240,7 +243,7 @@ def main(station="", subset="", run_all=False, clobber=False):

# Obtain list of station(s) to process (single/subset/all)
all_filenames = utils.get_station_list_to_process(utils.MONTHLY_CSV_IN_DIR,
EXTENSION,
f"{IN_EXTENSION}{COMPRESSION}",
station=station,
subset=subset,
run_all=run_all,
Expand All @@ -251,23 +254,27 @@ def main(station="", subset="", run_all=False, clobber=False):
data_policy_df = data_policy_df.astype(str)

for filename in all_filenames:
print(f"Processing {filename}")
if not os.path.exists(filename):
print("Input {} file missing: {}".format(IN_EXTENSION, filename))
continue
else:
print(f"Processing {filename}")

# using a lambda as not all columns are present in each file
df = pd.read_csv(filename, sep=",", usecols=lambda c: c in LITE_COLS)
df = pd.read_csv(filename, sep=",", usecols=lambda c: c in LITE_COLS, compression='infer')

# set output filenames
station_id = df.iloc[1]["STATION"] # NOTE: this is renamed below to "primary_station_id"
outroot_cdmlite = os.path.join(utils.MONTHLY_CDM_LITE_OUT_DIR, utils.MONTHLY_CDM_LITE_FILE_ROOT)
outroot_cdmobs = os.path.join(utils.MONTHLY_CDM_OBS_OUT_DIR, utils.MONTHLY_CDM_OBS_FILE_ROOT)
outroot_cdmhead = os.path.join(utils.MONTHLY_CDM_HEAD_OUT_DIR, utils.MONTHLY_CDM_HEAD_FILE_ROOT)
cdmlite_outfile = f"{outroot_cdmlite}{station_id}.psv"
cdmobs_outfile = f"{outroot_cdmobs}{station_id}.psv"
cdmhead_outfile = f"{outroot_cdmhead}{station_id}.psv"
cdmlite_outfile = f"{outroot_cdmlite}{station_id}{OUT_EXTENSION}{COMPRESSION}"
cdmobs_outfile = f"{outroot_cdmobs}{station_id}{OUT_EXTENSION}{COMPRESSION}"
cdmhead_outfile = f"{outroot_cdmhead}{station_id}{OUT_EXTENSION}{COMPRESSION}"

if not clobber:
# TODO - ensure all exist
if os.path.exists(cdmlite_outfile):
# all output files exist
if os.path.exists(cdmlite_outfile) and os.path.exists(cdmobs_outfile) and os.path.exists(cdmhead_outfile):
print(f" Output files for {filename} already exist:")
print(f" {cdmlite_outfile}")
print(f" {cdmobs_outfile}")
Expand Down Expand Up @@ -703,7 +710,7 @@ def main(station="", subset="", run_all=False, clobber=False):


try:
df_lite_out.to_csv(cdmlite_outfile, index=False, sep="|")
df_lite_out.to_csv(cdmlite_outfile, index=False, sep="|", compression='infer')
print(f" {cdmlite_outfile}")
except IOError:
print(f"Cannot save datafile: {cdmlite_outfile}")
Expand All @@ -712,14 +719,14 @@ def main(station="", subset="", run_all=False, clobber=False):
try:
unique_variables = dfobs['observed_variable'].unique()
print(unique_variables)
dfobs.to_csv(cdmobs_outfile, index=False, sep="|")
dfobs.to_csv(cdmobs_outfile, index=False, sep="|", compression='infer')
print(f" {cdmobs_outfile}")
except IOError:
print(f"Cannot save datafile: {cdmobs_outfile}")
continue

try:
hdf.to_csv(cdmhead_outfile, index=False, sep="|")
hdf.to_csv(cdmhead_outfile, index=False, sep="|", compression='infer')
print(f" {cdmhead_outfile}")
except IOError:
print(f"Cannot save datafile: {cdmhead_outfile}")
Expand Down

0 comments on commit 67310bd

Please sign in to comment.