Merge pull request #45 from metno/add-revision-string-to-metadata
Add revision string to the metadata of readers
jgriesfeller authored Aug 29, 2024
2 parents 6924bd1 + 2567562 commit d182098
Showing 14 changed files with 184 additions and 10 deletions.
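
All touched readers share the same pattern: while parsing, each reader keeps track of the newest revision timestamp it encounters and exposes it through a new metadata() method as a "%y%m%d%H%M%S" string. A minimal sketch of that shared pattern, assuming nothing beyond the standard library (the class and helper names are illustrative, not taken from any one reader):

import datetime

class ExampleReader:
    def __init__(self):
        # start at datetime.min, so any real timestamp seen while parsing replaces it
        self._revision = datetime.datetime.min

    def _track_revision(self, datestring: str) -> None:
        # keep the newest timestamp seen so far
        self._revision = max(
            self._revision,
            datetime.datetime.strptime(datestring, "%Y-%m-%dT%H:%M:%S"),
        )

    def metadata(self) -> dict:
        # compact revision string, e.g. "240523120000"
        return dict(revision=self._revision.strftime("%y%m%d%H%M%S"))

The file-based readers (ascii2netcdf, harpreader, netcdf_rw) derive the same string from dataset attributes such as "last_changed" or "history" instead of per-row timestamps, as the diffs below show.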
13 changes: 12 additions & 1 deletion src/pyaro_readers/aeronetsdareader/AeronetSdaTimeseriesReader.py
@@ -5,6 +5,7 @@
from urllib.parse import urlparse
from urllib.request import urlopen
from zipfile import BadZipFile, ZipFile
import datetime

import numpy as np
import requests
@@ -73,7 +74,7 @@ def __init__(
filename,
filters=[],
fill_country_flag: bool = FILL_COUNTRY_FLAG,
tqdm_desc: [str, None] = None,
tqdm_desc: str | None = None,
ts_type: str = "daily",
):
"""open a new csv timeseries-reader
@@ -101,6 +102,7 @@ def __init__(
self._set_filters(filters)
self._header = []
_laststatstr = ""
self._revision = datetime.datetime.min

# check if file is a URL
if self.is_valid_url(self._filename):
@@ -217,6 +219,12 @@ def __init__(
day, month, year = row[DATE_NAME].split(":")
datestring = "-".join([year, month, day])
datestring = "T".join([datestring, row[TIME_NAME]])
self._revision = max(
[
self._revision,
datetime.datetime.strptime(datestring, "%Y-%m-%dT%H:%M:%S"),
]
)
time_dummy = np.datetime64(datestring)
start = time_dummy - TS_TYPE_DIFFS[ts_type]
end = time_dummy + TS_TYPE_DIFFS[ts_type]
@@ -258,6 +266,9 @@ def __init__(
)
bar.close()

def metadata(self):
return dict(revision=datetime.datetime.strftime(self._revision, "%y%m%d%H%M%S"))

def _unfiltered_data(self, varname) -> Data:
return self._data[varname]

14 changes: 12 additions & 2 deletions src/pyaro_readers/aeronetsunreader/AeronetSunTimeseriesReader.py
@@ -18,6 +18,7 @@
Station,
)
from tqdm import tqdm
import datetime

# default URL
BASE_URL = "https://aeronet.gsfc.nasa.gov/data_push/V3/All_Sites_Times_Daily_Averages_AOD20.zip"
@@ -62,7 +63,7 @@ def __init__(
filename,
filters=[],
fill_country_flag: bool = FILL_COUNTRY_FLAG,
tqdm_desc: [str, None] = None,
tqdm_desc: str | None = None,
ts_type: str = "daily",
):
"""open a new Aeronet timeseries-reader
Expand Down Expand Up @@ -90,7 +91,7 @@ def __init__(
self._set_filters(filters)
self._header = []
_laststatstr = ""

self._revision = datetime.datetime.min
# check if file is a URL
if self.is_valid_url(self._filename):
# try to open as zipfile
@@ -165,6 +166,12 @@ def __init__(
day, month, year = row[DATE_NAME].split(":")
datestring = "-".join([year, month, day])
datestring = "T".join([datestring, row[TIME_NAME]])
self._revision = max(
[
self._revision,
datetime.datetime.strptime(datestring, "%Y-%m-%dT%H:%M:%S"),
]
)
time_dummy = np.datetime64(datestring)
start = time_dummy - TS_TYPE_DIFFS[ts_type]
end = time_dummy + TS_TYPE_DIFFS[ts_type]
@@ -192,6 +199,9 @@ def __init__(
)
bar.close()

def metadata(self):
return dict(revision=datetime.datetime.strftime(self._revision, "%y%m%d%H%M%S"))

def _unfiltered_data(self, varname) -> Data:
return self._data[varname]

24 changes: 24 additions & 0 deletions src/pyaro_readers/ascii2netcdf/Ascii2NetcdfTimeseries.py
@@ -1,4 +1,5 @@
import csv
import datetime
import glob
import inspect
import logging
@@ -13,6 +14,7 @@
Station,
)
import pyaro.timeseries.Filter
import xarray as xr

logger = logging.getLogger(__name__)

@@ -85,6 +87,28 @@ def __init__(
)
return

def iterate_files(self):
for y in self._years:
file_path = os.path.join(self._directory, f"data_{self._resolution}.{y}.nc")
if os.path.exists(file_path):
yield file_path

def metadata(self):
metadata = dict()
date = datetime.datetime.min
for f in self.iterate_files():
with xr.open_dataset(f) as d:
hist: str = d.attrs.get("last_changed", "")

datestr = hist.split("//")[0]
new_date = datetime.datetime.strptime(datestr, "%a %b %d %H:%M:%S %Y")
if new_date > date:
date = new_date

metadata["revision"] = datetime.datetime.strftime(date, "%y%m%d%H%M%S")

return metadata

def _is_year_in_filters(self, year):
start_year = np.datetime64(f"{year}-01-01 00:00:00")
end_year = np.datetime64(f"{year}-12-31 23:59:59")
19 changes: 18 additions & 1 deletion src/pyaro_readers/harpreader/harpreader.py
@@ -15,6 +15,7 @@
from tqdm import tqdm
import cfunits
from pyaro_readers.units_helpers import UALIASES
import datetime

logger = logging.getLogger(__name__)

@@ -39,7 +40,7 @@ class AeronetHARPReader(AutoFilterReaderEngine.AutoFilterReader):

def __init__(
self,
file: [Path, str],
file: Path | str,
filters=[],
vars_to_read: list[str] = None,
):
@@ -97,6 +98,22 @@ def __init__(
)
bar.close()

def metadata(self):
metadata = dict()
date = datetime.datetime.min
for f in self._files:
with xr.open_dataset(f) as d:
hist: str = d.attrs.get("history", "")

datestr = ":".join(hist.split(":")[:3])
new_date = datetime.datetime.strptime(datestr, "%a %b %d %H:%M:%S %Y")
if new_date > date:
date = new_date

metadata["revision"] = datetime.datetime.strftime(date, "%y%m%d%H%M%S")

return metadata

def _read_file_variables(self, filename) -> dict[str, str]:
"""Returns a mapping of variable name to unit for the dataset.
38 changes: 38 additions & 0 deletions src/pyaro_readers/netcdf_rw/Netcdf_RWTimeseries.py
@@ -13,6 +13,9 @@
Station,
)
import pyaro.timeseries.Filter
import xarray as xr
import datetime


logger = logging.getLogger(__name__)

@@ -67,6 +70,41 @@ def __init__(
raise Netcdf_RWTimeseriesException(f"unable to read definition-file: {ex}")
return

def iterate_files(self):
for y in self._years:
file_path = os.path.join(self._directory, f"{self.ncfile_prefix}.{y}.nc")
if os.path.exists(file_path):
yield file_path

def metadata(self):
metadata = dict()
date = datetime.datetime.min
for f in self.iterate_files():
with xr.open_dataset(f) as d:
hist = d.attrs.get("last_changed", None)

try:
datestr = hist.split("//")[0]
new_date = datetime.datetime.strptime(
datestr, "%a %b %d %H:%M:%S %Y"
)
except Exception:
try:
hist = d.attrs.get("history", "")[-1]
datestr = " ".join(hist.split(" ")[:2])
new_date = datetime.datetime.strptime(
datestr, "%Y-%m-%d %H:%M:%S"
)
except Exception:
new_date = datetime.datetime.min

if new_date > date:
date = new_date

metadata["revision"] = datetime.datetime.strftime(date, "%y%m%d%H%M%S")

return metadata

def _read_json(self, file, empty):
filepath = os.path.join(self._directory, file)
res = empty
@@ -64,14 +64,15 @@ def __init__(
filename,
filters=[],
fill_country_flag: bool = FILL_COUNTRY_FLAG,
tqdm_desc: [str, None] = None,
tqdm_desc: str | None = None,
file_mask: str = FILE_MASK,
ts_type: str = "hourly",
):
self._stations = {}
self._data = {}
self._set_filters(filters)
self._header = []
self._revision = datetime.datetime.min

if Path(filename).is_file():
self._filename = filename
@@ -108,6 +109,14 @@ def _process_open_file(

startdate = "".join(lines[INDECIES["DATES"]].split()[:3])
startdate = datetime.datetime.strptime(startdate, "%Y%m%d")
self._revision = max(
[
self._revision,
datetime.datetime.strptime(
" ".join(lines[INDECIES["DATES"]].split()[3:]), "%Y %m %d"
),
]
)

lon = float(lines[INDECIES["LON"]].split(":")[1].strip())
lat = float(lines[INDECIES["LAT"]].split(":")[1].strip())
@@ -176,6 +185,14 @@ def _process_open_file(
np.nan,
)

def metadata(self):
return dict(revision=datetime.datetime.strftime(self._revision, "%y%m%d%H%M%S"))
# metadata = dict()
# metadata["revision"] = hashlib.md5(
# "".join(self._md5filehashes).encode()
# ).hexdigest()
# return metadata

def _unfiltered_data(self, varname) -> Data:
return self._data[varname]

@@ -200,7 +217,7 @@ def _lookup_function(self):
return lambda lat, lon: geo.lookup_nearest(lat, lon)["ISO_A2_EH"]


class NILUPMFAbsorptionTimeseriesEngine(AutoFilterReaderEngine.AutoFilterEngine):
class NILUPMFAbsorptionTimeseriesEngine(AutoFilterReaderEngine.AutoFilterEngine): #
def reader_class(self):
return NILUPMFAbsorptionReader

28 changes: 24 additions & 4 deletions src/pyaro_readers/nilupmfebas/EbasPmfReader.py
@@ -13,6 +13,8 @@

from pathlib import Path
import re
import datetime


logger = logging.getLogger(__name__)

@@ -28,9 +30,9 @@ class EBASPMFReaderException(Exception):
class EbasPmfTimeseriesReader(AutoFilterReaderEngine.AutoFilterReader):
def __init__(
self,
filename: [Path, str],
filename: Path | str,
filters=[],
tqdm_desc: [str, None] = None,
tqdm_desc: str | None = None,
filemask: str = FILE_MASK,
vars_to_read: list[str] = None,
):
@@ -42,6 +44,7 @@ def __init__(
self._opts = {"default": ReadEbasOptions()}
self._variables = {}
self._metadata = {}
self._revision = datetime.datetime.min

# variable include filter comes like this
# {'variables': {'include': ['PM10_density']}}
@@ -73,9 +76,16 @@ def __init__(
# filename is something else
raise EBASPMFReaderException(f"No such file or directory: {filename}")

def metadata(self):
metadata = dict()
metadata["revision"] = datetime.datetime.strftime(
self._revision, "%y%m%d%H%M%S"
)
return metadata

def read_file_basic(
self,
filename: [Path, str],
filename: Path | str,
):
"""Read EBAS NASA Ames file
@@ -93,12 +103,22 @@ def read_file_basic(

return data_out

def read_file(self, filename: [Path, str], vars_to_read: list[str] = None):
def read_file(self, filename: Path | str, vars_to_read: list[str] = None):
"""Read EBAS NASA Ames file and put the data in the object"""

_file_dummy = self.read_file_basic(filename)
self._revision = max(
[
self._revision,
datetime.datetime.strptime(
_file_dummy.meta["revision_date"], "%Y%m%d%H%M%S"
),
]
)

matrix = _file_dummy.meta["matrix"]
vars_read_in_file = []

# multicolumn file: ebas var names come from _file_dummy.col_names_vars
for var_idx, var_def in enumerate(_file_dummy.var_defs):
# continue if the variable is not an actual data variable (but e.g. time)
10 changes: 10 additions & 0 deletions tests/test_AERONETSDATimeSeriesReader.py
@@ -47,6 +47,8 @@ def test_dl_data_tared(self):
count += len(ts.data(var))
self.assertEqual(count, 421984)
self.assertEqual(len(ts.stations()), 94)
self.assertIn("revision", ts.metadata())
self.assertGreaterEqual(int(ts.metadata()["revision"]), 230726120000)

def test_dl_data_unzipped(self):
if not self.external_resource_available(TEST_URL):
@@ -63,6 +65,8 @@ def test_dl_data_unzipped(self):
count += len(ts.data(var))
self.assertEqual(count, 79944)
self.assertEqual(len(ts.stations()), 4)
self.assertIn("revision", ts.metadata())
self.assertGreaterEqual(int(ts.metadata()["revision"]), 220622120000)

def test_dl_data_zipped(self):
if not self.external_resource_available(TEST_ZIP_URL):
@@ -80,6 +84,9 @@
self.assertEqual(count, 79944)
self.assertEqual(len(ts.stations()), 4)

self.assertIn("revision", ts.metadata())
self.assertGreaterEqual(int(ts.metadata()["revision"]), 220622120000)

def test_aeronet_data_zipped(self):
if not os.path.exists("/lustre"):
self.skipTest(f"lustre not available; skipping Aeronet download on CI")
@@ -99,6 +106,9 @@
self.assertGreaterEqual(count, 49965)
self.assertGreaterEqual(len(ts.stations()), 4)

self.assertIn("revision", ts.metadata())
self.assertGreaterEqual(int(ts.metadata()["revision"]), 240523120000)

def test_init(self):
engine = pyaro.list_timeseries_engines()["aeronetsdareader"]
self.assertEqual(engine.url(), "https://github.com/metno/pyaro-readers")
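
For reference, a usage sketch of the new interface, following the engine-lookup pattern used in the tests above; the engine name and file path below are illustrative assumptions, not part of this diff:

import pyaro

engines = pyaro.list_timeseries_engines()
# "aeronetsunreader" and the path are assumptions for illustration
with engines["aeronetsunreader"].open(
    "testdata/aeronetsun_testdata.csv", filters=[]
) as ts:
    # readers touched by this PR now report a revision string
    print(ts.metadata()["revision"])  # e.g. "240523120000"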