From be0dffdc67bd39a344a7e00b48bb23ba27ff4a39 Mon Sep 17 00:00:00 2001 From: Leonardo Schwarz Date: Fri, 5 Jul 2024 13:00:09 +0200 Subject: [PATCH] further reorganization in persistence module --- src/depiction/persistence/__init__.py | 3 +- src/depiction/persistence/file_checksums.py | 10 +- .../persistence/imzml/imzml_reader.py | 60 +---------- src/depiction/persistence/ram/__init__.py | 0 .../persistence/{ => ram}/ram_read_file.py | 22 ++-- .../persistence/{ => ram}/ram_reader.py | 0 .../persistence/{ => ram}/ram_write_file.py | 16 +-- src/depiction/persistence/types.py | 102 ++++++++++++++++++ tests/unit/persistence/ram/__init__.py | 0 .../{ => ram}/test_ram_read_file.py | 4 +- .../persistence/{ => ram}/test_ram_reader.py | 2 +- .../{ => ram}/test_ram_write_file.py | 2 +- 12 files changed, 141 insertions(+), 80 deletions(-) create mode 100644 src/depiction/persistence/ram/__init__.py rename src/depiction/persistence/{ => ram}/ram_read_file.py (68%) rename src/depiction/persistence/{ => ram}/ram_reader.py (100%) rename src/depiction/persistence/{ => ram}/ram_write_file.py (82%) create mode 100644 src/depiction/persistence/types.py create mode 100644 tests/unit/persistence/ram/__init__.py rename tests/unit/persistence/{ => ram}/test_ram_read_file.py (94%) rename tests/unit/persistence/{ => ram}/test_ram_reader.py (98%) rename tests/unit/persistence/{ => ram}/test_ram_write_file.py (96%) diff --git a/src/depiction/persistence/__init__.py b/src/depiction/persistence/__init__.py index b471dc2..c73ef80 100644 --- a/src/depiction/persistence/__init__.py +++ b/src/depiction/persistence/__init__.py @@ -3,8 +3,7 @@ from depiction.persistence.imzml.imzml_reader import ImzmlReader from depiction.persistence.imzml.imzml_write_file import ImzmlWriteFile from depiction.persistence.imzml.imzml_writer import ImzmlWriter -from .ram_read_file import RamReadFile -from .ram_reader import RamReader +from depiction.persistence.ram.ram_read_file import RamReadFile __all__ = [ 
"ImzmlModeEnum", diff --git a/src/depiction/persistence/file_checksums.py b/src/depiction/persistence/file_checksums.py index 7362d4d..9d78d32 100644 --- a/src/depiction/persistence/file_checksums.py +++ b/src/depiction/persistence/file_checksums.py @@ -1,7 +1,11 @@ +from __future__ import annotations + import hashlib from functools import cached_property -from pathlib import Path -from typing import Any +from typing import TYPE_CHECKING, Callable + +if TYPE_CHECKING: + from pathlib import Path class FileChecksums: @@ -32,7 +36,7 @@ def checksum_sha256(self) -> str: """The SHA-256 checksum of the file.""" return self._compute_checksum(hashlib_method=hashlib.sha256) - def _compute_checksum(self, hashlib_method: Any) -> str: + def _compute_checksum(self, hashlib_method: Callable[[], hashlib._Hash]) -> str: """Returns the checksum of the file using the native tool, or falls back to hashlib if the native tool is not available. :param hashlib_method: the hashlib method to use, e.g. `hashlib.md5` diff --git a/src/depiction/persistence/imzml/imzml_reader.py b/src/depiction/persistence/imzml/imzml_reader.py index 34ce06c..e7940e8 100644 --- a/src/depiction/persistence/imzml/imzml_reader.py +++ b/src/depiction/persistence/imzml/imzml_reader.py @@ -8,14 +8,14 @@ import pyimzml.ImzMLParser from depiction.persistence.imzml.imzml_mode_enum import ImzmlModeEnum +from depiction.persistence.types import GenericReader if TYPE_CHECKING: - from types import TracebackType from pathlib import Path from numpy.typing import NDArray -class ImzmlReader: +class ImzmlReader(GenericReader): """ Memmap based reader for imzML files, that can be pickled. 
""" @@ -98,14 +98,6 @@ def ibd_mmap(self) -> mmap.mmap: ) return self._ibd_mmap - def __enter__(self) -> ImzmlReader: - return self - - def __exit__( - self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None - ) -> None: - self.close() - def close(self) -> None: """Closes the .ibd file handles, if open.""" if self._ibd_mmap is not None: @@ -135,38 +127,6 @@ def coordinates(self) -> NDArray[int]: """Returns the coordinates of the spectra in the imzML file, shape (n_spectra, n_dim).""" return self._coordinates - @cached_property - def coordinates_2d(self) -> NDArray[int]: - """Returns the coordinates of the spectra in the imzML file, shape (n_spectra, 2).""" - return self.coordinates[:, :2] - - def get_spectrum(self, i_spectrum: int) -> tuple[NDArray[float], NDArray[float]]: - """Returns the m/z and intensity arrays of the i-th spectrum.""" - return self.get_spectrum_mz(i_spectrum=i_spectrum), self.get_spectrum_int(i_spectrum=i_spectrum) - - def get_spectrum_with_coords(self, i_spectrum: int) -> tuple[NDArray[float], NDArray[float], NDArray[float]]: - """Returns the m/z, intensity and v arrays of the i-th spectrum.""" - mz_arr = self.get_spectrum_mz(i_spectrum=i_spectrum) - int_arr = self.get_spectrum_int(i_spectrum=i_spectrum) - coords = self.get_spectrum_coordinates(i_spectrum=i_spectrum) - return mz_arr, int_arr, coords - - def get_spectra( - self, i_spectra: list[int] - ) -> tuple[NDArray[float] | list[NDArray[float]], NDArray[float] | list[NDArray[float]]]: - """Returns the m/z and intensity arrays of the specified spectra. - For continuous mode, the arrays are stacked into a single array, whereas - for processed mode, a list of arrays is returned as they might not have - the same shape. 
- """ - if self.imzml_mode == ImzmlModeEnum.CONTINUOUS: - mz_arr = self.get_spectrum_mz(i_spectrum=i_spectra[0]) - mz_arr_list = np.repeat(mz_arr[np.newaxis, :], len(i_spectra), axis=0) - int_arr_list = np.stack([self.get_spectrum_int(i_spectrum=i) for i in i_spectra], axis=0) - return mz_arr_list, int_arr_list - else: - return tuple(zip(*[self.get_spectrum(i_spectrum=i) for i in i_spectra])) - def get_spectrum_mz(self, i_spectrum: int) -> NDArray[float]: """Returns the m/z values of the i-th spectrum.""" file = self.ibd_mmap @@ -181,26 +141,10 @@ def get_spectrum_int(self, i_spectrum: int) -> NDArray[float]: int_bytes = file.read(self._int_arr_lengths[i_spectrum] * self._int_bytes) return np.frombuffer(int_bytes, dtype=self._int_arr_dtype) - def get_spectrum_coordinates(self, i_spectrum: int) -> NDArray[int]: - """Returns the coordinates of the i-th spectrum.""" - return self.coordinates[i_spectrum] - def get_spectrum_n_points(self, i_spectrum: int) -> int: """Returns the number of data points in the i-th spectrum.""" return self._int_arr_lengths[i_spectrum] - def get_spectra_mz_range(self, i_spectra: list[int] | None) -> tuple[float, float]: - """Returns the m/z range of the given spectra, returning the global min and max m/z value.""" - if i_spectra is None: - i_spectra = range(self.n_spectra) - mz_min = np.inf - mz_max = -np.inf - for i_spectrum in i_spectra: - mz_arr = self.get_spectrum_mz(i_spectrum) - mz_min = mz_arr[0] if mz_arr[0] < mz_min else mz_min - mz_max = mz_arr[-1] if mz_arr[-1] > mz_max else mz_max - return mz_min, mz_max - @classmethod def parse_imzml(cls, path: Path) -> ImzmlReader: """Parses an imzML file and returns an ImzmlReader.""" diff --git a/src/depiction/persistence/ram/__init__.py b/src/depiction/persistence/ram/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/depiction/persistence/ram_read_file.py b/src/depiction/persistence/ram/ram_read_file.py similarity index 68% rename from 
src/depiction/persistence/ram_read_file.py rename to src/depiction/persistence/ram/ram_read_file.py index 048b4e8..8177af2 100644 --- a/src/depiction/persistence/ram_read_file.py +++ b/src/depiction/persistence/ram/ram_read_file.py @@ -1,14 +1,24 @@ +from __future__ import annotations from contextlib import contextmanager from functools import cached_property -from collections.abc import Generator from pathlib import Path -from depiction.persistence import ImzmlModeEnum -from depiction.persistence.ram_reader import RamReader +from depiction.persistence.ram.ram_reader import RamReader +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from depiction.persistence import ImzmlModeEnum + from collections.abc import Generator + from numpy.typing import NDArray class RamReadFile: - def __init__(self, mz_arr_list, int_arr_list, coordinates) -> None: + def __init__( + self, + mz_arr_list: list[NDArray[float]] | NDArray[float], + int_arr_list: list[NDArray[float]] | NDArray[float], + coordinates: NDArray[int], + ) -> None: self._mz_arr_list = mz_arr_list self._int_arr_list = int_arr_list self._coordinates = coordinates @@ -44,9 +54,9 @@ def imzml_mode(self) -> ImzmlModeEnum: return reader.imzml_mode @property - def coordinates(self): + def coordinates(self) -> NDArray[int]: return self._coordinates @property - def coordinates_2d(self): + def coordinates_2d(self) -> NDArray[int]: return self._coordinates[:, :2] diff --git a/src/depiction/persistence/ram_reader.py b/src/depiction/persistence/ram/ram_reader.py similarity index 100% rename from src/depiction/persistence/ram_reader.py rename to src/depiction/persistence/ram/ram_reader.py diff --git a/src/depiction/persistence/ram_write_file.py b/src/depiction/persistence/ram/ram_write_file.py similarity index 82% rename from src/depiction/persistence/ram_write_file.py rename to src/depiction/persistence/ram/ram_write_file.py index 0837dee..bede5aa 100644 --- a/src/depiction/persistence/ram_write_file.py +++ 
b/src/depiction/persistence/ram/ram_write_file.py @@ -1,14 +1,16 @@ from __future__ import annotations -from tqdm import tqdm + from contextlib import contextmanager from typing import TYPE_CHECKING +from tqdm import tqdm from depiction.persistence import ImzmlModeEnum, RamReadFile if TYPE_CHECKING: + from numpy.typing import NDArray + from collections.abc import Generator from collections.abc import Sequence - import numpy as np class RamWriteFile: @@ -19,7 +21,7 @@ def __init__(self, imzml_mode: ImzmlModeEnum) -> None: self._imzml_mode = imzml_mode @property - def imzml_mode(self): + def imzml_mode(self) -> ImzmlModeEnum: return self._imzml_mode # Just for the sake of a clean api this does not really belong here... @@ -34,7 +36,7 @@ def imzml_mode(self): # self._coordinates.append(coordinates) @contextmanager - def writer(self): + def writer(self) -> Generator[_Writer, None, None]: yield _Writer(self) def to_read_file(self) -> RamReadFile: @@ -50,7 +52,7 @@ class _Writer: def __init__(self, file: RamWriteFile) -> None: self._file = file - def add_spectrum(self, mz_arr: np.ndarray, int_arr: np.ndarray, coordinates) -> None: + def add_spectrum(self, mz_arr: NDArray[float], int_arr: NDArray[float], coordinates: NDArray[int]) -> None: self._file._mz_arr_list.append(mz_arr) self._file._int_arr_list.append(int_arr) self._file._coordinates_list.append(coordinates) @@ -59,12 +61,12 @@ def copy_spectra(self, reader, spectra_indices: Sequence[int], tqdm_position: in # TODO reuse the implementation from ImzmlWriter as this is 100% identical if tqdm_position is not None: - def progress_fn(x): + def progress_fn(x: Sequence[int]) -> Sequence[int] | tqdm: return tqdm(x, desc=" spectrum", position=tqdm_position) else: - def progress_fn(x): + def progress_fn(x: Sequence[int]) -> Sequence[int] | tqdm: return x for i_spectrum in progress_fn(spectra_indices): diff --git a/src/depiction/persistence/types.py b/src/depiction/persistence/types.py new file mode 100644 index 
0000000..973f4b4 --- /dev/null +++ b/src/depiction/persistence/types.py @@ -0,0 +1,102 @@ +from __future__ import annotations + +from functools import cached_property +from typing import TYPE_CHECKING, Self, Protocol + +import numpy as np + +from depiction.persistence.imzml.imzml_mode_enum import ImzmlModeEnum + +if TYPE_CHECKING: + from types import TracebackType + from numpy.typing import NDArray + + +# TODO better name + + +class GenericReader(Protocol): + def __enter__(self) -> Self: + return self + + def __exit__( + self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None + ) -> None: + self.close() + + def close(self) -> None: ... + + @property + def imzml_mode(self) -> ImzmlModeEnum: + """Returns the mode of the imzML file.""" + ... + + @property + def n_spectra(self) -> int: + """The number of spectra available in the .imzML file.""" + ... + + @cached_property + def coordinates(self) -> NDArray[int]: + """Returns the coordinates of the spectra in the imzML file, shape (n_spectra, n_dim).""" + ... 
+ + @cached_property + def coordinates_2d(self) -> NDArray[int]: + """Returns the coordinates of the spectra in the imzML file, shape (n_spectra, 2).""" + return self.coordinates[:, :2] + + def get_spectrum(self, i_spectrum: int) -> tuple[NDArray[float], NDArray[float]]: + """Returns the m/z and intensity arrays of the i-th spectrum.""" + return self.get_spectrum_mz(i_spectrum=i_spectrum), self.get_spectrum_int(i_spectrum=i_spectrum) + + def get_spectrum_with_coords(self, i_spectrum: int) -> tuple[NDArray[float], NDArray[float], NDArray[float]]: + """Returns the m/z, intensity and coordinate arrays of the i-th spectrum.""" + mz_arr = self.get_spectrum_mz(i_spectrum=i_spectrum) + int_arr = self.get_spectrum_int(i_spectrum=i_spectrum) + coords = self.get_spectrum_coordinates(i_spectrum=i_spectrum) + return mz_arr, int_arr, coords + + def get_spectra( + self, i_spectra: list[int] + ) -> tuple[NDArray[float] | list[NDArray[float]], NDArray[float] | list[NDArray[float]]]: + """Returns the m/z and intensity arrays of the specified spectra. + For continuous mode, the arrays are stacked into a single array, whereas + for processed mode, a list of arrays is returned as they might not have + the same shape. + """ + if self.imzml_mode == ImzmlModeEnum.CONTINUOUS: + mz_arr = self.get_spectrum_mz(i_spectrum=i_spectra[0]) + mz_arr_list = np.repeat(mz_arr[np.newaxis, :], len(i_spectra), axis=0) + int_arr_list = np.stack([self.get_spectrum_int(i_spectrum=i) for i in i_spectra], axis=0) + return mz_arr_list, int_arr_list + else: + return tuple(zip(*[self.get_spectrum(i_spectrum=i) for i in i_spectra])) + + def get_spectrum_mz(self, i_spectrum: int) -> NDArray[float]: + """Returns the m/z values of the i-th spectrum.""" + ... + + def get_spectrum_int(self, i_spectrum: int) -> NDArray[float]: + """Returns the intensity values of the i-th spectrum.""" + ... 
+ + def get_spectrum_coordinates(self, i_spectrum: int) -> NDArray[int]: + """Returns the coordinates of the i-th spectrum.""" + return self.coordinates[i_spectrum] + + def get_spectrum_n_points(self, i_spectrum: int) -> int: + """Returns the number of data points in the i-th spectrum.""" + return len(self.get_spectrum_mz(i_spectrum)) + + def get_spectra_mz_range(self, i_spectra: list[int] | None) -> tuple[float, float]: + """Returns the m/z range of the given spectra, returning the global min and max m/z value.""" + if i_spectra is None: + i_spectra = range(self.n_spectra) + mz_min = np.inf + mz_max = -np.inf + for i_spectrum in i_spectra: + mz_arr = self.get_spectrum_mz(i_spectrum) + mz_min = mz_arr[0] if mz_arr[0] < mz_min else mz_min + mz_max = mz_arr[-1] if mz_arr[-1] > mz_max else mz_max + return mz_min, mz_max diff --git a/tests/unit/persistence/ram/__init__.py b/tests/unit/persistence/ram/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/persistence/test_ram_read_file.py b/tests/unit/persistence/ram/test_ram_read_file.py similarity index 94% rename from tests/unit/persistence/test_ram_read_file.py rename to tests/unit/persistence/ram/test_ram_read_file.py index 9d2d3c4..3470a6f 100644 --- a/tests/unit/persistence/test_ram_read_file.py +++ b/tests/unit/persistence/ram/test_ram_read_file.py @@ -5,7 +5,7 @@ import numpy as np from depiction.persistence import ImzmlModeEnum -from depiction.persistence.ram_read_file import RamReadFile +from depiction.persistence.ram.ram_read_file import RamReadFile class TestRamReadFile(unittest.TestCase): @@ -34,7 +34,7 @@ def test_reader(self, method_get_reader) -> None: mock_reader.close.assert_called_once_with() method_get_reader.assert_called_once_with() - @patch("depiction.persistence.ram_read_file.RamReader") + @patch("depiction.persistence.ram.ram_read_file.RamReader") def test_get_reader(self, construct_ram_reader) -> None: reader = self.mock_read_file.get_reader() 
construct_ram_reader.assert_called_once_with( diff --git a/tests/unit/persistence/test_ram_reader.py b/tests/unit/persistence/ram/test_ram_reader.py similarity index 98% rename from tests/unit/persistence/test_ram_reader.py rename to tests/unit/persistence/ram/test_ram_reader.py index d734752..1bac9d2 100644 --- a/tests/unit/persistence/test_ram_reader.py +++ b/tests/unit/persistence/ram/test_ram_reader.py @@ -5,7 +5,7 @@ import numpy as np from depiction.persistence import ImzmlModeEnum -from depiction.persistence.ram_reader import RamReader +from depiction.persistence.ram.ram_reader import RamReader class TestRamReader(unittest.TestCase): diff --git a/tests/unit/persistence/test_ram_write_file.py b/tests/unit/persistence/ram/test_ram_write_file.py similarity index 96% rename from tests/unit/persistence/test_ram_write_file.py rename to tests/unit/persistence/ram/test_ram_write_file.py index 66570c5..f0cb682 100644 --- a/tests/unit/persistence/test_ram_write_file.py +++ b/tests/unit/persistence/ram/test_ram_write_file.py @@ -2,7 +2,7 @@ from functools import cached_property from unittest.mock import MagicMock -from depiction.persistence.ram_write_file import RamWriteFile +from depiction.persistence.ram.ram_write_file import RamWriteFile from typing import NoReturn