Skip to content

Commit

Permalink
further reorganization in persistence module
Browse files Browse the repository at this point in the history
  • Loading branch information
leoschwarz committed Jul 5, 2024
1 parent 844812f commit be0dffd
Show file tree
Hide file tree
Showing 12 changed files with 141 additions and 80 deletions.
3 changes: 1 addition & 2 deletions src/depiction/persistence/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
from depiction.persistence.imzml.imzml_reader import ImzmlReader
from depiction.persistence.imzml.imzml_write_file import ImzmlWriteFile
from depiction.persistence.imzml.imzml_writer import ImzmlWriter
from .ram_read_file import RamReadFile
from .ram_reader import RamReader
from depiction.persistence.ram.ram_read_file import RamReadFile

__all__ = [
"ImzmlModeEnum",
Expand Down
10 changes: 7 additions & 3 deletions src/depiction/persistence/file_checksums.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
from __future__ import annotations

import hashlib
from functools import cached_property
from pathlib import Path
from typing import Any
from typing import TYPE_CHECKING, Callable

if TYPE_CHECKING:
from pathlib import Path


class FileChecksums:
Expand Down Expand Up @@ -32,7 +36,7 @@ def checksum_sha256(self) -> str:
"""The SHA-256 checksum of the file."""
return self._compute_checksum(hashlib_method=hashlib.sha256)

def _compute_checksum(self, hashlib_method: Any) -> str:
def _compute_checksum(self, hashlib_method: Callable[[], hashlib._Hash]) -> str:
"""Returns the checksum of the file using the native tool, or falls back to hashlib if the
native tool is not available.
:param hashlib_method: the hashlib method to use, e.g. `hashlib.md5`
Expand Down
60 changes: 2 additions & 58 deletions src/depiction/persistence/imzml/imzml_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@
import pyimzml.ImzMLParser

from depiction.persistence.imzml.imzml_mode_enum import ImzmlModeEnum
from depiction.persistence.types import GenericReader

if TYPE_CHECKING:
from types import TracebackType
from pathlib import Path
from numpy.typing import NDArray


class ImzmlReader:
class ImzmlReader(GenericReader):
"""
Memmap based reader for imzML files, that can be pickled.
"""
Expand Down Expand Up @@ -98,14 +98,6 @@ def ibd_mmap(self) -> mmap.mmap:
)
return self._ibd_mmap

def __enter__(self) -> ImzmlReader:
return self

def __exit__(
self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None
) -> None:
self.close()

def close(self) -> None:
"""Closes the .ibd file handles, if open."""
if self._ibd_mmap is not None:
Expand Down Expand Up @@ -135,38 +127,6 @@ def coordinates(self) -> NDArray[int]:
"""Returns the coordinates of the spectra in the imzML file, shape (n_spectra, n_dim)."""
return self._coordinates

@cached_property
def coordinates_2d(self) -> NDArray[int]:
"""Returns the coordinates of the spectra in the imzML file, shape (n_spectra, 2)."""
return self.coordinates[:, :2]

def get_spectrum(self, i_spectrum: int) -> tuple[NDArray[float], NDArray[float]]:
"""Returns the m/z and intensity arrays of the i-th spectrum."""
return self.get_spectrum_mz(i_spectrum=i_spectrum), self.get_spectrum_int(i_spectrum=i_spectrum)

def get_spectrum_with_coords(self, i_spectrum: int) -> tuple[NDArray[float], NDArray[float], NDArray[float]]:
"""Returns the m/z, intensity and v arrays of the i-th spectrum."""
mz_arr = self.get_spectrum_mz(i_spectrum=i_spectrum)
int_arr = self.get_spectrum_int(i_spectrum=i_spectrum)
coords = self.get_spectrum_coordinates(i_spectrum=i_spectrum)
return mz_arr, int_arr, coords

def get_spectra(
self, i_spectra: list[int]
) -> tuple[NDArray[float] | list[NDArray[float]], NDArray[float] | list[NDArray[float]]]:
"""Returns the m/z and intensity arrays of the specified spectra.
For continuous mode, the arrays are stacked into a single array, whereas
for processed mode, a list of arrays is returned as they might not have
the same shape.
"""
if self.imzml_mode == ImzmlModeEnum.CONTINUOUS:
mz_arr = self.get_spectrum_mz(i_spectrum=i_spectra[0])
mz_arr_list = np.repeat(mz_arr[np.newaxis, :], len(i_spectra), axis=0)
int_arr_list = np.stack([self.get_spectrum_int(i_spectrum=i) for i in i_spectra], axis=0)
return mz_arr_list, int_arr_list
else:
return tuple(zip(*[self.get_spectrum(i_spectrum=i) for i in i_spectra]))

def get_spectrum_mz(self, i_spectrum: int) -> NDArray[float]:
"""Returns the m/z values of the i-th spectrum."""
file = self.ibd_mmap
Expand All @@ -181,26 +141,10 @@ def get_spectrum_int(self, i_spectrum: int) -> NDArray[float]:
int_bytes = file.read(self._int_arr_lengths[i_spectrum] * self._int_bytes)
return np.frombuffer(int_bytes, dtype=self._int_arr_dtype)

def get_spectrum_coordinates(self, i_spectrum: int) -> NDArray[int]:
"""Returns the coordinates of the i-th spectrum."""
return self.coordinates[i_spectrum]

def get_spectrum_n_points(self, i_spectrum: int) -> int:
"""Returns the number of data points in the i-th spectrum."""
return self._int_arr_lengths[i_spectrum]

def get_spectra_mz_range(self, i_spectra: list[int] | None) -> tuple[float, float]:
"""Returns the m/z range of the given spectra, returning the global min and max m/z value."""
if i_spectra is None:
i_spectra = range(self.n_spectra)
mz_min = np.inf
mz_max = -np.inf
for i_spectrum in i_spectra:
mz_arr = self.get_spectrum_mz(i_spectrum)
mz_min = mz_arr[0] if mz_arr[0] < mz_min else mz_min
mz_max = mz_arr[-1] if mz_arr[-1] > mz_max else mz_max
return mz_min, mz_max

@classmethod
def parse_imzml(cls, path: Path) -> ImzmlReader:
"""Parses an imzML file and returns an ImzmlReader."""
Expand Down
Empty file.
Original file line number Diff line number Diff line change
@@ -1,14 +1,24 @@
from __future__ import annotations
from contextlib import contextmanager
from functools import cached_property
from collections.abc import Generator
from pathlib import Path

from depiction.persistence import ImzmlModeEnum
from depiction.persistence.ram_reader import RamReader
from depiction.persistence.ram.ram_reader import RamReader
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from depiction.persistence import ImzmlModeEnum
from collections.abc import Generator
from numpy.typing import NDArray


class RamReadFile:
def __init__(self, mz_arr_list, int_arr_list, coordinates) -> None:
def __init__(
self,
mz_arr_list: list[NDArray[float]] | NDArray[float],
int_arr_list: list[NDArray[float]] | NDArray[float],
coordinates: NDArray[int],
) -> None:
self._mz_arr_list = mz_arr_list
self._int_arr_list = int_arr_list
self._coordinates = coordinates
Expand Down Expand Up @@ -44,9 +54,9 @@ def imzml_mode(self) -> ImzmlModeEnum:
return reader.imzml_mode

@property
def coordinates(self):
def coordinates(self) -> NDArray[int]:
return self._coordinates

@property
def coordinates_2d(self):
def coordinates_2d(self) -> NDArray[int]:
return self._coordinates[:, :2]
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from __future__ import annotations
from tqdm import tqdm

from contextlib import contextmanager
from typing import TYPE_CHECKING

from tqdm import tqdm

from depiction.persistence import ImzmlModeEnum, RamReadFile

if TYPE_CHECKING:
from numpy.typing import NDArray
from collections.abc import Generator
from collections.abc import Sequence
import numpy as np


class RamWriteFile:
Expand All @@ -19,7 +21,7 @@ def __init__(self, imzml_mode: ImzmlModeEnum) -> None:
self._imzml_mode = imzml_mode

@property
def imzml_mode(self):
def imzml_mode(self) -> ImzmlModeEnum:
return self._imzml_mode

# Just for the sake of a clean api this does not really belong here...
Expand All @@ -34,7 +36,7 @@ def imzml_mode(self):
# self._coordinates.append(coordinates)

@contextmanager
def writer(self):
def writer(self) -> Generator[_Writer, None, None]:
yield _Writer(self)

def to_read_file(self) -> RamReadFile:
Expand All @@ -50,7 +52,7 @@ class _Writer:
def __init__(self, file: RamWriteFile) -> None:
self._file = file

def add_spectrum(self, mz_arr: np.ndarray, int_arr: np.ndarray, coordinates) -> None:
def add_spectrum(self, mz_arr: NDArray[float], int_arr: NDArray[float], coordinates: NDArray[int]) -> None:
self._file._mz_arr_list.append(mz_arr)
self._file._int_arr_list.append(int_arr)
self._file._coordinates_list.append(coordinates)
Expand All @@ -59,12 +61,12 @@ def copy_spectra(self, reader, spectra_indices: Sequence[int], tqdm_position: in
# TODO reuse the implementation from ImzmlWriter as this is 100% identical
if tqdm_position is not None:

def progress_fn(x):
def progress_fn(x: Sequence[int]) -> Sequence[int] | tqdm:
return tqdm(x, desc=" spectrum", position=tqdm_position)

else:

def progress_fn(x):
def progress_fn(x: Sequence[int]) -> Sequence[int] | tqdm:
return x

for i_spectrum in progress_fn(spectra_indices):
Expand Down
102 changes: 102 additions & 0 deletions src/depiction/persistence/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
from __future__ import annotations

from functools import cached_property
from typing import TYPE_CHECKING, Self, Protocol

import numpy as np

from depiction.persistence.imzml.imzml_mode_enum import ImzmlModeEnum

if TYPE_CHECKING:
from types import TracebackType
from numpy.typing import NDArray


# TODO better name


class GenericReader(Protocol):
def __enter__(self) -> Self:
return self

def __exit__(
self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None
) -> None:
self.close()

def close(self) -> None: ...

@property
def imzml_mode(self) -> ImzmlModeEnum:
"""Returns the mode of the imzML file."""
...

@property
def n_spectra(self) -> int:
"""The number of spectra available in the .imzML file."""
...

@cached_property
def coordinates(self) -> NDArray[int]:
"""Returns the coordinates of the spectra in the imzML file, shape (n_spectra, n_dim)."""
...

@cached_property
def coordinates_2d(self) -> NDArray[int]:
"""Returns the coordinates of the spectra in the imzML file, shape (n_spectra, 2)."""
return self.coordinates[:, :2]

def get_spectrum(self, i_spectrum: int) -> tuple[NDArray[float], NDArray[float]]:
"""Returns the m/z and intensity arrays of the i-th spectrum."""
return self.get_spectrum_mz(i_spectrum=i_spectrum), self.get_spectrum_int(i_spectrum=i_spectrum)

def get_spectrum_with_coords(self, i_spectrum: int) -> tuple[NDArray[float], NDArray[float], NDArray[float]]:
"""Returns the m/z, intensity and v arrays of the i-th spectrum."""
mz_arr = self.get_spectrum_mz(i_spectrum=i_spectrum)
int_arr = self.get_spectrum_int(i_spectrum=i_spectrum)
coords = self.get_spectrum_coordinates(i_spectrum=i_spectrum)
return mz_arr, int_arr, coords

def get_spectra(
self, i_spectra: list[int]
) -> tuple[NDArray[float] | list[NDArray[float]], NDArray[float] | list[NDArray[float]]]:
"""Returns the m/z and intensity arrays of the specified spectra.
For continuous mode, the arrays are stacked into a single array, whereas
for processed mode, a list of arrays is returned as they might not have
the same shape.
"""
if self.imzml_mode == ImzmlModeEnum.CONTINUOUS:
mz_arr = self.get_spectrum_mz(i_spectrum=i_spectra[0])
mz_arr_list = np.repeat(mz_arr[np.newaxis, :], len(i_spectra), axis=0)
int_arr_list = np.stack([self.get_spectrum_int(i_spectrum=i) for i in i_spectra], axis=0)
return mz_arr_list, int_arr_list
else:
return tuple(zip(*[self.get_spectrum(i_spectrum=i) for i in i_spectra]))

def get_spectrum_mz(self, i_spectrum: int) -> NDArray[float]:
"""Returns the m/z values of the i-th spectrum."""
...

def get_spectrum_int(self, i_spectrum: int) -> NDArray[float]:
"""Returns the intensity values of the i-th spectrum."""
...

def get_spectrum_coordinates(self, i_spectrum: int) -> NDArray[int]:
"""Returns the coordinates of the i-th spectrum."""
return self.coordinates[i_spectrum]

def get_spectrum_n_points(self, i_spectrum: int) -> int:
"""Returns the number of data points in the i-th spectrum."""
return len(self.get_spectrum_mz(i_spectrum))

def get_spectra_mz_range(self, i_spectra: list[int] | None) -> tuple[float, float]:
"""Returns the m/z range of the given spectra, returning the global min and max m/z value."""
if i_spectra is None:
i_spectra = range(self.n_spectra)
mz_min = np.inf
mz_max = -np.inf
for i_spectrum in i_spectra:
mz_arr = self.get_spectrum_mz(i_spectrum)
mz_min = mz_arr[0] if mz_arr[0] < mz_min else mz_min
mz_max = mz_arr[-1] if mz_arr[-1] > mz_max else mz_max
return mz_min, mz_max
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import numpy as np

from depiction.persistence import ImzmlModeEnum
from depiction.persistence.ram_read_file import RamReadFile
from depiction.persistence.ram.ram_read_file import RamReadFile


class TestRamReadFile(unittest.TestCase):
Expand Down Expand Up @@ -34,7 +34,7 @@ def test_reader(self, method_get_reader) -> None:
mock_reader.close.assert_called_once_with()
method_get_reader.assert_called_once_with()

@patch("depiction.persistence.ram_read_file.RamReader")
@patch("depiction.persistence.ram.ram_read_file.RamReader")
def test_get_reader(self, construct_ram_reader) -> None:
reader = self.mock_read_file.get_reader()
construct_ram_reader.assert_called_once_with(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import numpy as np

from depiction.persistence import ImzmlModeEnum
from depiction.persistence.ram_reader import RamReader
from depiction.persistence.ram.ram_reader import RamReader


class TestRamReader(unittest.TestCase):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from functools import cached_property
from unittest.mock import MagicMock

from depiction.persistence.ram_write_file import RamWriteFile
from depiction.persistence.ram.ram_write_file import RamWriteFile
from typing import NoReturn


Expand Down

0 comments on commit be0dffd

Please sign in to comment.