From b77cd0ccf88a34a7fde97ed48abe3f11227b0f42 Mon Sep 17 00:00:00 2001 From: Leonardo Schwarz Date: Mon, 1 Jul 2024 14:57:00 +0200 Subject: [PATCH] improve file_checksums implementation --- examples/file_checksums_benchmark.py | 55 +++++++++ src/depiction/persistence/file_checksums.py | 44 ++----- tests/unit/persistence/test_file_checksums.py | 113 ++++++++---------- 3 files changed, 114 insertions(+), 98 deletions(-) create mode 100644 examples/file_checksums_benchmark.py diff --git a/examples/file_checksums_benchmark.py b/examples/file_checksums_benchmark.py new file mode 100644 index 0000000..18bdec7 --- /dev/null +++ b/examples/file_checksums_benchmark.py @@ -0,0 +1,55 @@ +import hashlib +import shutil +import subprocess +import timeit +from pathlib import Path + + +def checksum_native(file: Path) -> str: + binary_path = shutil.which("sha1sum") + result = subprocess.run( + [binary_path, str(file)], + capture_output=True, + text=True, + encoding="utf-8", + check=True, + ) + return result.stdout.split()[0].lower() + + +def checksum_naive(file: Path) -> str: + return hashlib.sha1(file.read_bytes()).hexdigest() + + +def checksum_chunked(file: Path, chunksize=4096) -> str: + hasher = hashlib.sha1() + with open(file, "rb") as f: + for chunk in iter(lambda: f.read(chunksize), b""): + hasher.update(chunk) + return hasher.hexdigest() + + +def main(): + # create test file of size 200 MiB + file = Path("testfile.png") + with file.open("wb") as f: + f.seek(200 * 1024 * 1024 - 1) + f.write(b"\0") + + # sanity check + print("Performing sanity check") + assert checksum_naive(file) == checksum_chunked(file) == checksum_native(file) + print("Sanity check passed") + + # benchmark + print("Naive:", timeit.timeit(lambda: checksum_naive(file), number=10)) + print("Chunked 4096:", timeit.timeit(lambda: checksum_chunked(file), number=10)) + print("Chunked 8192:", timeit.timeit(lambda: checksum_chunked(file, chunksize=8192), number=10)) + print("Chunked 16384:", timeit.timeit(lambda: checksum_chunked(file, chunksize=16384), number=10)) + print("Chunked 30000:", timeit.timeit(lambda: checksum_chunked(file, chunksize=30000), number=10)) + print("Chunked 50000:", timeit.timeit(lambda: checksum_chunked(file, chunksize=50000), number=10)) + print("Native:", timeit.timeit(lambda: checksum_native(file), number=10)) + + +if __name__ == "__main__": + main() diff --git a/src/depiction/persistence/file_checksums.py b/src/depiction/persistence/file_checksums.py index 9d9203b..0ec1b90 100644 --- a/src/depiction/persistence/file_checksums.py +++ b/src/depiction/persistence/file_checksums.py @@ -1,9 +1,7 @@ import hashlib -import shutil -import subprocess from functools import cached_property from pathlib import Path -from typing import Any, Optional +from typing import Any class FileChecksums: @@ -17,47 +15,25 @@ def file_path(self) -> Path: @cached_property def checksum_md5(self) -> str: """The MD5 checksum of the file.""" - return self._compute_checksum(native_tool="md5sum", hashlib_method=hashlib.md5) + return self._compute_checksum(hashlib_method=hashlib.md5) @cached_property def checksum_sha1(self) -> str: """The SHA-1 checksum of the file.""" - return self._compute_checksum(native_tool="sha1sum", hashlib_method=hashlib.sha1) + return self._compute_checksum(hashlib_method=hashlib.sha1) @cached_property def checksum_sha256(self) -> str: """The SHA-256 checksum of the file.""" - return self._compute_checksum(native_tool="sha256sum", hashlib_method=hashlib.sha256) + return self._compute_checksum(hashlib_method=hashlib.sha256) - def _compute_checksum(self, native_tool: str, hashlib_method: Any) -> str: + def _compute_checksum(self, hashlib_method: Any) -> str: """Returns the checksum of the file using the native tool, or falls back to hashlib if the native tool is not available. - :param native_tool: the name of the binary tool to use, e.g. `md5sum` :param hashlib_method: the hashlib method to use, e.g. `hashlib.md5` """ - # default to the native unix tool since these are usually much faster than python's hashlib - checksum = self._compute_checksum_native_tool(binary_name=native_tool, file=self.file_path) - if checksum is not None: - return checksum - - # fallback to the hashlib method - return hashlib_method(self._file_path.read_bytes()).hexdigest() - - def _compute_checksum_native_tool(self, binary_name: str, file: Path) -> Optional[str]: - """Returns the checksum of the file using the native tool, or None if the tool is not available. - The checksum is returned as a string in lower case. - :param binary_name: the name of the binary tool to use - :param file: the file to compute the checksum for - """ - binary_path = shutil.which(binary_name) - if binary_path is None: - return None - else: - result = subprocess.run( - [binary_path, str(file)], - capture_output=True, - text=True, - encoding="utf-8", - check=True, - ) - return result.stdout.split()[0].lower() + hasher = hashlib_method() + with self._file_path.open("rb") as f: + for chunk in iter(lambda: f.read(16384), b""): + hasher.update(chunk) + return hasher.hexdigest() diff --git a/tests/unit/persistence/test_file_checksums.py b/tests/unit/persistence/test_file_checksums.py index 9e09345..8445e2b 100644 --- a/tests/unit/persistence/test_file_checksums.py +++ b/tests/unit/persistence/test_file_checksums.py @@ -1,73 +1,58 @@ import hashlib -import unittest -from functools import cached_property +import io from pathlib import Path -from unittest.mock import MagicMock, patch + +import pytest +from pytest_mock import MockFixture from depiction.persistence.file_checksums import FileChecksums -class TestFileChecksums(unittest.TestCase): - def setUp(self) -> None: - self.mock_file_path = MagicMock(name="file_path", spec=Path) - - @cached_property - def mock_checksums(self) -> FileChecksums: - return FileChecksums(file_path=self.mock_file_path) - - def test_file_path(self) -> None: - self.assertEqual(self.mock_file_path, self.mock_checksums.file_path) - - @patch.object(FileChecksums, "_compute_checksum") - def test_checksum_md5(self, method_compute_checksum) -> None: - self.assertEqual(method_compute_checksum.return_value, self.mock_checksums.checksum_md5) - method_compute_checksum.assert_called_once_with(native_tool="md5sum", hashlib_method=hashlib.md5) - - @patch.object(FileChecksums, "_compute_checksum") - def test_checksum_sha1(self, method_compute_checksum) -> None: - self.assertEqual(method_compute_checksum.return_value, self.mock_checksums.checksum_sha1) - method_compute_checksum.assert_called_once_with(native_tool="sha1sum", hashlib_method=hashlib.sha1) - - @patch.object(FileChecksums, "_compute_checksum") - def test_checksum_sha256(self, method_compute_checksum) -> None: - self.assertEqual(method_compute_checksum.return_value, self.mock_checksums.checksum_sha256) - method_compute_checksum.assert_called_once_with(native_tool="sha256sum", hashlib_method=hashlib.sha256) - - @patch("shutil.which") - @patch("subprocess.run") - def test_checksum_when_native_tool_available(self, mock_subprocess_run, mock_shutil_which) -> None: - mock_shutil_which.return_value = "some/path" - mock_subprocess_run.return_value.stdout = "checksum" - mock_hashlib_method = MagicMock(name="mock_hashlib_method") - self.mock_file_path = Path("/dev/null/hello") - self.assertEqual( - "checksum", self.mock_checksums._compute_checksum(native_tool="tool", hashlib_method=mock_hashlib_method) - ) - mock_subprocess_run.assert_called_once_with( - ["some/path", "/dev/null/hello"], - capture_output=True, - text=True, - encoding="utf-8", - check=True, - ) - mock_shutil_which.assert_called_once_with("tool") - - @patch("shutil.which") - def test_checksum_when_native_tool_not_available(self, mock_shutil_which) -> None: - mock_shutil_which.return_value = None - self.mock_file_path.read_bytes.return_value = b"content" - mock_hashlib_method = MagicMock(name="mock_hashlib_method") - - self.assertEqual( - mock_hashlib_method.return_value.hexdigest.return_value, - self.mock_checksums._compute_checksum(native_tool="tool", hashlib_method=mock_hashlib_method), - ) - - mock_shutil_which.assert_called_once_with("tool") - self.mock_file_path.read_bytes.assert_called_once_with() - mock_hashlib_method.assert_called_once_with(b"content") - mock_hashlib_method.return_value.hexdigest.assert_called_once_with() +@pytest.fixture +def mock_file_path(mocker: MockFixture): + return mocker.MagicMock(name="file_path", spec=Path) + + +@pytest.fixture +def mock_checksums(mock_file_path) -> FileChecksums: + return FileChecksums(file_path=mock_file_path) + + +def test_file_path(mock_file_path, mock_checksums) -> None: + assert mock_checksums.file_path == mock_file_path + + +def test_checksum_md5(mocker: MockFixture, mock_checksums: FileChecksums) -> None: + mock_compute_checksum = mocker.patch.object(FileChecksums, "_compute_checksum") + assert mock_checksums.checksum_md5 == mock_compute_checksum.return_value + mock_compute_checksum.assert_called_once_with(hashlib_method=hashlib.md5) + + +def test_compute_checksum_sha1(mocker: MockFixture, mock_checksums: FileChecksums) -> None: + mock_compute_checksum = mocker.patch.object(FileChecksums, "_compute_checksum") + assert mock_checksums.checksum_sha1 == mock_compute_checksum.return_value + mock_compute_checksum.assert_called_once_with(hashlib_method=hashlib.sha1) + + +def test_compute_checksum_sha256(mocker: MockFixture, mock_checksums: FileChecksums) -> None: + mock_compute_checksum = mocker.patch.object(FileChecksums, "_compute_checksum") + assert mock_checksums.checksum_sha256 == mock_compute_checksum.return_value + mock_compute_checksum.assert_called_once_with(hashlib_method=hashlib.sha256) + + +def test_compute_checksum(mocker: MockFixture) -> None: + mock_file_path = mocker.MagicMock(name="file_path", spec=Path) + mock_file_path.open.return_value.__enter__.return_value = io.BytesIO(b"content") + mock_hashlib_method = mocker.MagicMock(name="mock_hashlib_method") + mock_checksums = FileChecksums(file_path=mock_file_path) + + checksum = mock_checksums._compute_checksum(hashlib_method=mock_hashlib_method) + assert checksum == mock_hashlib_method.return_value.hexdigest.return_value + + mock_file_path.open.assert_called_once_with("rb") + mock_hashlib_method.assert_called_once_with() + mock_hashlib_method.return_value.update.assert_called_once_with(b"content") if __name__ == "__main__": - unittest.main() + pytest.main()