Skip to content

Commit

Permalink
Merge pull request pytroll#2818 from mraspaud/fix-h5-loading-unicity
Browse files Browse the repository at this point in the history
Fix uniqueness of hdf5-based dask arrays
  • Loading branch information
mraspaud authored Jun 19, 2024
2 parents 90d36d1 + 2c6af5e commit a7ff99f
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 3 deletions.
19 changes: 16 additions & 3 deletions satpy/readers/hdf5_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,21 @@
"""Helpers for reading hdf5-based files."""

import logging
import os

import dask.array as da
import dask.config as dc
import h5py
import numpy as np
import xarray as xr
from dask.array.core import normalize_chunks
from dask.base import tokenize

from satpy.readers import open_file_or_filename
from satpy.readers.file_handlers import BaseFileHandler
from satpy.readers.utils import np2str
from satpy.utils import get_legacy_chunk_size

LOG = logging.getLogger(__name__)
CHUNK_SIZE = get_legacy_chunk_size()


class HDF5FileHandler(BaseFileHandler):
Expand Down Expand Up @@ -102,7 +104,7 @@ def __getitem__(self, key):
# these datasets are closed and inaccessible when the file is closed, need to reopen
f_obj = open_file_or_filename(self.filename)
dset = h5py.File(f_obj, "r")[key]
dset_data = da.from_array(dset, chunks=CHUNK_SIZE)
dset_data = from_h5_array(dset)
attrs = self._attrs_cache.get(key, dset.attrs)
if dset.ndim == 2:
return xr.DataArray(dset_data, dims=["y", "x"], attrs=attrs)
Expand All @@ -120,3 +122,14 @@ def get(self, item, default=None):
return self[item]
else:
return default


def from_h5_array(h5dset):
    """Wrap an h5py dataset in a dask array with a reproducible name.

    The chunking is derived from dask's configured ``array.chunk-size``,
    using the dataset's dtype, shape and own hdf5 chunking as hints. The dask
    array name is the dataset's hdf5 name followed by a token computed from
    the file name, the dataset name and the resulting chunks.

    Args:
        h5dset: The h5py dataset to wrap.

    Returns:
        The dask array backed by *h5dset*.
    """
    dask_chunks = normalize_chunks(
        dc.get("array.chunk-size"),
        dtype=h5dset.dtype,
        previous_chunks=h5dset.chunks,
        shape=h5dset.shape,
    )

    # NOTE(review): ``h5dset.file.filename`` may not be a real path when the file
    # was opened from a file-like object -- confirm the token stays stable then.
    token = tokenize(os.fspath(h5dset.file.filename), h5dset.name, dask_chunks)
    array_name = h5dset.name + "-" + token

    return da.from_array(h5dset, chunks=dask_chunks, name=array_name)
10 changes: 10 additions & 0 deletions satpy/tests/reader_tests/test_hdf5_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,3 +150,13 @@ def test_all_basic(self):
assert "fake_ds" not in file_handler

assert isinstance(file_handler["ds2_f/attr/test_ref"], np.ndarray)

def test_array_name_uniqueness(self):
    """Test that the dask array name of an hdf5 dataset stays constant across accesses."""
    from satpy.readers.hdf5_utils import HDF5FileHandler

    dsname = "test_group/ds1_f"
    handler = HDF5FileHandler("test.h5", {}, {})

    # Each access re-creates the dask array, so the two names come from separate arrays.
    first_name = handler[dsname].data.name
    second_name = handler[dsname].data.name
    assert first_name == second_name

    # The name is expected to begin with the absolute hdf5 path of the dataset.
    expected_prefix = "/" + dsname
    assert handler[dsname].data.name.startswith(expected_prefix)

0 comments on commit a7ff99f

Please sign in to comment.