Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Inference data loader #81

Open
wants to merge 3 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ml4gw/dataloading/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .chunked_dataset import ChunkedTimeSeriesDataset
from .hdf5_dataset import Hdf5TimeSeriesDataset
from .in_memory_dataset import InMemoryDataset
from .inference_dataset import InferenceDataset
77 changes: 77 additions & 0 deletions ml4gw/dataloading/inference_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import math
from typing import Optional, Sequence

import h5py
import numpy as np


class InferenceDataset:
    def __init__(
        self,
        fname: str,
        channels: Sequence[str],
        stride_size: int,
        shift_sizes: Optional[Sequence[int]] = None,
    ):
        """
        Simple iterable dataset that chronologically loads
        `stride_size` windows of timeseries data.
        If `shift_sizes` is provided, each channel is read with
        its corresponding (non-negative) offset applied, so the
        yielded windows contain relatively-shifted data.

        It is _strongly_ recommended that these files have been
        written using [chunked storage]
        (https://docs.h5py.org/en/stable/high/dataset.html#chunked-storage).
        This has shown to produce increases in read-time speeds
        of over an order of magnitude.

        Args:
            fname:
                Path to HDF5 file from which to load data.
            channels:
                Datasets to read from the indicated file, which
                will be stacked along dim 1 of the generated batches
                during iteration. All datasets must have equal length.
            stride_size:
                Size of the windows to read and yield at each step
            shift_sizes:
                List of non-negative shift sizes to apply to each
                channel. If `None`, no shifts will be applied.

        Raises:
            ValueError:
                If `shift_sizes` has a different length than
                `channels`, contains a negative value, or if the
                channel datasets have mismatched lengths.
        """

        self.fname = fname
        self.stride_size = stride_size
        self.channels = channels

        if shift_sizes is not None:
            if len(shift_sizes) != len(channels):
                raise ValueError("Shifts must be the same length as channels")
            if any(shift < 0 for shift in shift_sizes):
                # Negative starts would wrap around under numpy/h5py
                # slicing semantics and silently return wrong data.
                raise ValueError("Shift sizes must be non-negative")
        self.shift_sizes = shift_sizes or [0] * len(channels)

        with h5py.File(fname, "r") as f:
            lengths = {channel: len(f[channel]) for channel in channels}

        # All channels are sliced with a common index during iteration,
        # so mismatched dataset lengths would silently misalign the
        # yielded windows. Fail loudly up front instead.
        if len(set(lengths.values())) > 1:
            raise ValueError(
                "Channel datasets have mismatched lengths: {}".format(lengths)
            )

        # Number of samples usable per channel once the largest
        # shift has been accounted for.
        self.size = next(iter(lengths.values())) - self.max_shift

    def __len__(self):
        """Number of windows this dataset will yield, including a
        final partial window if `size` is not a stride multiple."""
        return math.ceil(self.size / self.stride_size)

    @property
    def max_shift(self):
        """Largest per-channel shift; bounds the usable data length."""
        return max(self.shift_sizes)

    def __iter__(self):
        """Chronologically yield arrays of shape
        `(num_channels, stride_size)` (the last window may be
        shorter along the final axis)."""
        with h5py.File(self.fname, "r") as f:
            idx = 0
            while idx < self.size:
                data = []
                for channel, shift in zip(self.channels, self.shift_sizes):
                    start = idx + shift
                    stop = start + self.stride_size

                    # make sure that segments with shifts shorter
                    # than the max shift get their ends cut off,
                    # so every channel in the stack has equal length
                    stop = min(self.size + shift, stop)
                    x = f[channel][start:stop]
                    data.append(x)

                yield np.stack(data)
                idx += self.stride_size
Loading