Skip to content

Commit

Permalink
Merge pull request #325 from informatics-lab/optimize-eida50
Browse files Browse the repository at this point in the history
Optimize EIDA50
  • Loading branch information
andrewgryan authored Apr 1, 2020
2 parents a923d59 + b0824d5 commit 2d37165
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 85 deletions.
2 changes: 1 addition & 1 deletion forest/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
.. automodule:: forest.presets
"""
__version__ = '0.13.4'
__version__ = '0.13.5'

from .config import *
from . import (
Expand Down
103 changes: 48 additions & 55 deletions forest/drivers/eida50.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,19 @@
import glob
import datetime as dt
import bokeh.models
import netCDF4
import xarray
import numpy as np
from functools import lru_cache
from forest.exceptions import FileNotFound, IndexNotFound
from forest.old_state import old_state, unique
from forest.util import coarsify
from forest.util import to_datetime as _to_datetime
import forest.util
from forest import (
geo,
locate,
view)


ENGINE = "h5netcdf"
MIN_DATETIME64 = np.datetime64('0001-01-01T00:00:00.000000')


Expand All @@ -25,28 +25,14 @@ def _natargmax(arr):
return np.argmax(no_nats)


def infinite_cache(f):
    """Unbounded cache to reduce navigation I/O

    Decorator for methods with the signature ``f(self, path, variable)``.
    Results are stored in a dict keyed on ``path`` alone: ``self`` and
    ``variable`` are forwarded to ``f`` on a cache miss but take no part
    in the lookup, so a later call with the same path and a different
    variable receives the first result. Entries are never evicted.

    .. note:: This information would be better saved in a database
              or file to reduce round-trips to disk

    :param f: method taking ``(self, path, variable)``
    :returns: wrapper with the same signature, name and docstring as ``f``
    """
    # Local import: only ``lru_cache`` is imported from functools by name
    import functools

    cache = {}

    @functools.wraps(f)
    def wrapped(self, path, variable):
        if path not in cache:
            cache[path] = f(self, path, variable)
        return cache[path]

    return wrapped


class Dataset:
def __init__(self, pattern=None, color_mapper=None, **kwargs):
self.pattern = pattern
self.color_mapper = color_mapper
self.locator = Locator(self.pattern)

def navigator(self):
return Navigator(self.pattern)
return Navigator(self.locator)

def map_view(self):
loader = Loader(self.locator)
Expand All @@ -57,11 +43,12 @@ class Locator:
"""Locate EIDA50 satellite images"""
def __init__(self, pattern):
self.pattern = pattern
self._glob = forest.util.cached_glob(dt.timedelta(minutes=15))

def find(self, date):
if isinstance(date, (dt.datetime, str)):
date = np.datetime64(date, 's')
paths = self.paths()
paths = self.glob()
ipath = self.find_file_index(paths, date)
path = paths[ipath]
time_axis = self.load_time_axis(path)
Expand All @@ -71,16 +58,14 @@ def find(self, date):
dt.timedelta(minutes=15))
return path, index

def paths(self):
return sorted(glob.glob(os.path.expanduser(self.pattern)))
def glob(self):
return self._glob(self.pattern)

@staticmethod
@lru_cache()
def load_time_axis(path):
with netCDF4.Dataset(path) as dataset:
var = dataset.variables["time"]
values = netCDF4.num2date(
var[:], units=var.units)
with xarray.open_dataset(path, engine=ENGINE) as nc:
values = nc["time"]
return np.array(values, dtype='datetime64[s]')

def find_file_index(self, paths, user_date):
Expand Down Expand Up @@ -110,14 +95,15 @@ def find_index(times, time, length):

@staticmethod
def parse_date(path):
# reg-ex to support file names like *20191211.nc
groups = re.search(r"([0-9]{8})\.nc", path)
if groups is None:
# reg-ex to support file names like *20191211T0000Z.nc
groups = re.search(r"([0-9]{8}T[0-9]{4}Z)\.nc", path)
return dt.datetime.strptime(groups[1], "%Y%m%dT%H%MZ")
else:
return dt.datetime.strptime(groups[1], "%Y%m%d")
"""Parse timestamp into datetime or None"""
for regex, fmt in [
(r"([0-9]{8})\.nc", "%Y%m%d"),
(r"([0-9]{8}T[0-9]{4}Z)\.nc", "%Y%m%dT%H%MZ")]:
groups = re.search(regex, path)
if groups is None:
continue
else:
return dt.datetime.strptime(groups[1], fmt)


class Loader:
Expand All @@ -131,11 +117,11 @@ def __init__(self, locator):
"image": []
}
self.cache = {}
paths = self.locator.paths()
paths = self.locator.glob()
if len(paths) > 0:
with netCDF4.Dataset(paths[-1]) as dataset:
self.cache["longitude"] = dataset.variables["longitude"][:]
self.cache["latitude"] = dataset.variables["latitude"][:]
with xarray.open_dataset(paths[-1], engine=ENGINE) as nc:
self.cache["longitude"] = nc["longitude"].values
self.cache["latitude"] = nc["latitude"].values

@property
def longitudes(self):
Expand All @@ -150,7 +136,7 @@ def image(self, state):
data = self.empty_image
else:
try:
data = self._image(_to_datetime(state.valid_time))
data = self._image(forest.util.to_datetime(state.valid_time))
except (FileNotFound, IndexNotFound):
data = self.empty_image
return data
Expand All @@ -162,18 +148,20 @@ def _image(self, valid_time):
def load_image(self, path, itime):
lons = self.longitudes
lats = self.latitudes
with netCDF4.Dataset(path) as dataset:
values = dataset.variables["data"][itime]
fraction = 0.25
lons, lats, values = coarsify(
lons, lats, values, fraction)
with xarray.open_dataset(path, engine=ENGINE) as nc:
values = nc["data"][itime].values

# Use datashader to coarsify images from 4.4km to 8.8km grid
scale = 2
return geo.stretch_image(
lons, lats, values)
lons, lats, values,
plot_width=int(values.shape[1] / scale),
plot_height=int(values.shape[0] / scale))


class Navigator:
def __init__(self, pattern):
self.pattern = pattern
def __init__(self, locator):
self.locator = locator

def variables(self, pattern):
return ["EIDA50"]
Expand All @@ -182,19 +170,24 @@ def initial_times(self, pattern, variable):
return [dt.datetime(1970, 1, 1)]

def valid_times(self, pattern, variable, initial_time):
"""Get available times given application state"""
paths = self.locator.glob()
return self.valid_times_from_paths(paths)

def valid_times_from_paths(self, paths):
"""Get available times by reading files"""
arrays = []
for path in sorted(glob.glob(pattern)):
arrays.append(self._valid_times(path, variable))
for path in sorted(paths):
timestamp = self.locator.parse_date(path)
if timestamp is None:
# Time(s) from file contents
arrays.append(self.locator.load_time_axis(path))
else:
# Time(s) from file name
arrays.append(np.array([timestamp], dtype='datetime64[s]'))
if len(arrays) == 0:
return []
return np.unique(np.concatenate(arrays))

@infinite_cache
def _valid_times(self, path, variable):
with netCDF4.Dataset(path) as dataset:
var = dataset.variables["time"]
values = netCDF4.num2date(var[:], units=var.units)
return np.array(values, dtype='datetime64[s]')

def pressures(self, pattern, variable, initial_time):
return []
9 changes: 2 additions & 7 deletions forest/drivers/saf.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,23 +91,18 @@ def _image(self, long_name, initial_time, valid_time, pressures, pressure):
return data


@forest.util.timeout_cache(dt.timedelta(minutes=10))
def cached_glob(pattern):
    """Glob file system at most once every 10 minutes for a pattern

    :param pattern: wildcard pattern handed to ``glob.glob``
    :returns: sorted list of matching paths
    """
    return sorted(glob.glob(pattern))


class Locator:
"""Locate SAF files"""
def __init__(self, pattern):
self.pattern = pattern
regex = "[0-9]{8}T[0-9]{6}Z"
fmt = "%Y%m%dT%H%M%S%Z"
self.parse_date = partial(forest.util.parse_date, regex, fmt)
self._glob = forest.util.cached_glob(dt.timedelta(minutes=10))

def glob(self):
"""List file system"""
return cached_glob(self.pattern)
return self._glob(self.pattern)

def find_paths(self, paths, date, frequency):
"""Find a file(s) containing information related to date"""
Expand Down
16 changes: 16 additions & 0 deletions forest/util.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import glob
import os
import re
import datetime as dt
Expand Down Expand Up @@ -35,6 +36,21 @@ def wrapped(x):
return decorator


_timeout_globs = {}


def cached_glob(interval):
    """Glob file system at most once every interval

    One wrapped globber is kept per distinct ``interval`` in the
    module-level ``_timeout_globs`` dict, so repeated requests for the
    same interval return the same function object.

    :param interval: hashable value forwarded to ``timeout_cache``
                     (callers pass a ``datetime.timedelta``)
    :returns: callable taking a glob pattern and returning sorted paths
    """
    # No ``global`` statement: the dict is mutated in place, never rebound
    if interval not in _timeout_globs:
        _timeout_globs[interval] = timeout_cache(interval)(_glob)
    return _timeout_globs[interval]


def _glob(pattern):
    """Return sorted paths matching ``pattern`` after expanding ``~``

    :param pattern: wildcard pattern, may begin with ``~`` or ``~user``
    :returns: sorted list of matching paths, empty if nothing matches
    """
    return sorted(glob.glob(os.path.expanduser(pattern)))


def coarsify(lons, lats, values, fraction):
values = scipy.ndimage.zoom(values, fraction)
data = np.ma.masked_array(values, np.isnan(values))
Expand Down
44 changes: 22 additions & 22 deletions test/test_drivers_eida50.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pytest
from unittest.mock import Mock
import datetime as dt
import bokeh.models
import netCDF4
Expand Down Expand Up @@ -72,10 +73,19 @@ def test_navigator_pressures():
assert navigator.pressures(None, None, None) == []


def test_locator_parse_date():
path = "/some/file-20190101.nc"
@pytest.mark.parametrize("path,expect", [
pytest.param("/some/file-20190101.nc",
dt.datetime(2019, 1, 1),
id="yyyymmdd format"),
pytest.param("/some/file-20190101T1245Z.nc",
dt.datetime(2019, 1, 1, 12, 45),
id="yyyymmdd hm format"),
pytest.param("eida50.nc",
None,
id="no timestamp"),
])
def test_locator_parse_date(path, expect):
result = eida50.Locator.parse_date(path)
expect = dt.datetime(2019, 1, 1)
assert expect == result


Expand Down Expand Up @@ -157,18 +167,6 @@ def test_locator_find_index_outside_range_raises_exception():
eida50.Locator.find_index(times, time, freq)


def test_navigator_valid_times_given_toa_brightness_temperature(tmpdir):
    """Navigator._valid_times returns the times written to the file"""
    path = str(tmpdir / "test-navigate-eida50.nc")
    times = [dt.datetime(2019, 1, 1)]
    with netCDF4.Dataset(path, "w") as dataset:
        _eida50(dataset, times)

    navigator = eida50.Navigator(path)
    result = navigator._valid_times(path, "toa_brightness_temperature")
    # Compare as arrays: a bare ``==`` between a list and an ndarray is
    # element-wise and only has a truth value for a single element
    expect = np.array(times, dtype="datetime64[s]")
    np.testing.assert_array_equal(expect, result)


def test_loader_image(tmpdir):
path = str(tmpdir / "file_20190417.nc")
with netCDF4.Dataset(path, "w") as dataset:
Expand All @@ -181,13 +179,6 @@ def test_loader_image(tmpdir):
assert expect == result


def test_locator_parse_date():
    """Locator.parse_date reads a trailing yyyymmdd stamp from a file name"""
    # NOTE(review): a test with this same name is defined earlier in the
    # module; the later definition rebinds the name, so only one of the
    # two would be collected -- confirm which is intended to survive
    path = "/some/EIDA50_takm4p4_20190417.nc"
    result = eida50.Locator.parse_date(path)
    expect = dt.datetime(2019, 4, 17)
    assert expect == result


def test_loader_longitudes(tmpdir):
path = str(tmpdir / "eida50_20190417.nc")
with netCDF4.Dataset(path, "w") as dataset:
Expand Down Expand Up @@ -244,3 +235,12 @@ def test_navigator_valid_times(tmpdir):
result = navigator.valid_times(path, variable, TIMES[0])
expect = TIMES
np.testing.assert_array_equal(expect, result)


def test_navigator_given_valid_time_none_returns_parsed_times():
    """Valid times are taken from the file name when it carries a date

    The test does not create the file, it only passes the name to
    ``valid_times_from_paths``.
    """
    paths = ["eida50_20200101.nc"]
    dataset = forest.drivers.get_dataset("eida50")
    navigator = dataset.navigator()
    result = navigator.valid_times_from_paths(paths)
    # Compare as arrays: a bare ``==`` between an ndarray and a list is
    # element-wise and only has a truth value for a single element
    expect = np.array([dt.datetime(2020, 1, 1)], dtype="datetime64[s]")
    np.testing.assert_array_equal(expect, result)

0 comments on commit 2d37165

Please sign in to comment.