Skip to content

Commit

Permalink
improve file_checksums implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
leoschwarz committed Jul 1, 2024
1 parent 6638330 commit b77cd0c
Show file tree
Hide file tree
Showing 3 changed files with 114 additions and 98 deletions.
55 changes: 55 additions & 0 deletions examples/file_checksums_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import hashlib
import shutil
import subprocess
import timeit
from pathlib import Path


def checksum_native(file: Path) -> str:
    """Return the lower-case SHA-1 hex digest of ``file`` as computed by the ``sha1sum`` binary.

    :param file: path of the file to hash
    :raises FileNotFoundError: if no ``sha1sum`` executable can be located via ``shutil.which``
    :raises subprocess.CalledProcessError: if ``sha1sum`` exits with a non-zero status
    """
    binary_path = shutil.which("sha1sum")
    if binary_path is None:
        # ``shutil.which`` returns None when the tool is missing; passing that on to
        # ``subprocess.run`` would fail with a confusing error about the argument list.
        raise FileNotFoundError("sha1sum executable not found on PATH")
    result = subprocess.run(
        [binary_path, str(file)],
        capture_output=True,
        text=True,
        encoding="utf-8",
        check=True,
    )
    # the first whitespace-separated token of the output is the digest
    return result.stdout.split()[0].lower()


def checksum_naive(file: Path) -> str:
    """Return the SHA-1 hex digest of ``file``, reading the whole file into memory in one go.

    :param file: path of the file to hash
    """
    content = file.read_bytes()
    digest = hashlib.sha1(content)
    return digest.hexdigest()


def checksum_chunked(file: Path, chunksize=4096) -> str:
    """Return the SHA-1 hex digest of ``file``, feeding the hasher ``chunksize`` bytes at a time.

    :param file: path of the file to hash
    :param chunksize: number of bytes requested from the file per read call
    """
    digest = hashlib.sha1()
    with open(file, "rb") as stream:
        # an empty read result marks the end of the file and terminates the loop
        while block := stream.read(chunksize):
            digest.update(block)
    return digest.hexdigest()


def main():
    """Benchmark the checksum implementations on a 200 MiB scratch file.

    Creates ``testfile.png`` in the current working directory, asserts that the naive, chunked and
    native implementations agree, prints the ``timeit`` total of 10 runs for each variant, and
    removes the scratch file again when done (also when the sanity check or a benchmark fails).
    """
    file = Path("testfile.png")
    try:
        # create test file of size 200 MiB
        with file.open("wb") as f:
            f.seek(200 * 1024 * 1024 - 1)
            f.write(b"\0")

        # sanity check
        print("Performing sanity check")
        assert checksum_naive(file) == checksum_chunked(file) == checksum_native(file)
        print("Sanity check passed")

        # benchmark
        print("Naive:", timeit.timeit(lambda: checksum_naive(file), number=10))
        for chunksize in (4096, 8192, 16384, 30000, 50000):
            # the lambda is executed by timeit within this iteration, so it sees the current chunksize
            print(
                f"Chunked {chunksize}:",
                timeit.timeit(lambda: checksum_chunked(file, chunksize=chunksize), number=10),
            )
        print("Native:", timeit.timeit(lambda: checksum_native(file), number=10))
    finally:
        # do not leave the 200 MiB scratch file behind
        file.unlink(missing_ok=True)


# Run the benchmark only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
44 changes: 10 additions & 34 deletions src/depiction/persistence/file_checksums.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
import hashlib
import shutil
import subprocess
from functools import cached_property
from pathlib import Path
from typing import Any, Optional
from typing import Any


class FileChecksums:
Expand All @@ -17,47 +15,25 @@ def file_path(self) -> Path:
@cached_property
def checksum_md5(self) -> str:
"""The MD5 checksum of the file."""
return self._compute_checksum(native_tool="md5sum", hashlib_method=hashlib.md5)
return self._compute_checksum(hashlib_method=hashlib.md5)

@cached_property
def checksum_sha1(self) -> str:
"""The SHA-1 checksum of the file."""
return self._compute_checksum(native_tool="sha1sum", hashlib_method=hashlib.sha1)
return self._compute_checksum(hashlib_method=hashlib.sha1)

@cached_property
def checksum_sha256(self) -> str:
"""The SHA-256 checksum of the file."""
return self._compute_checksum(native_tool="sha256sum", hashlib_method=hashlib.sha256)
return self._compute_checksum(hashlib_method=hashlib.sha256)

def _compute_checksum(self, native_tool: str, hashlib_method: Any) -> str:
def _compute_checksum(self, hashlib_method: Any) -> str:
"""Returns the checksum of the file using the native tool, or falls back to hashlib if the
native tool is not available.
:param native_tool: the name of the binary tool to use, e.g. `md5sum`
:param hashlib_method: the hashlib method to use, e.g. `hashlib.md5`
"""
# default to the native unix tool since these are usually much faster than python's hashlib
checksum = self._compute_checksum_native_tool(binary_name=native_tool, file=self.file_path)
if checksum is not None:
return checksum

# fallback to the hashlib method
return hashlib_method(self._file_path.read_bytes()).hexdigest()

def _compute_checksum_native_tool(self, binary_name: str, file: Path) -> Optional[str]:
"""Returns the checksum of the file using the native tool, or None if the tool is not available.
The checksum is returned as a string in lower case.
:param binary_name: the name of the binary tool to use
:param file: the file to compute the checksum for
"""
binary_path = shutil.which(binary_name)
if binary_path is None:
return None
else:
result = subprocess.run(
[binary_path, str(file)],
capture_output=True,
text=True,
encoding="utf-8",
check=True,
)
return result.stdout.split()[0].lower()
hasher = hashlib_method()
with self._file_path.open("rb") as f:
for chunk in iter(lambda: f.read(16384), b""):
hasher.update(chunk)
return hasher.hexdigest()
113 changes: 49 additions & 64 deletions tests/unit/persistence/test_file_checksums.py
Original file line number Diff line number Diff line change
@@ -1,73 +1,58 @@
import hashlib
import unittest
from functools import cached_property
import io
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest
from pytest_mock import MockFixture

from depiction.persistence.file_checksums import FileChecksums


class TestFileChecksums(unittest.TestCase):
def setUp(self) -> None:
self.mock_file_path = MagicMock(name="file_path", spec=Path)

@cached_property
def mock_checksums(self) -> FileChecksums:
return FileChecksums(file_path=self.mock_file_path)

def test_file_path(self) -> None:
self.assertEqual(self.mock_file_path, self.mock_checksums.file_path)

@patch.object(FileChecksums, "_compute_checksum")
def test_checksum_md5(self, method_compute_checksum) -> None:
self.assertEqual(method_compute_checksum.return_value, self.mock_checksums.checksum_md5)
method_compute_checksum.assert_called_once_with(native_tool="md5sum", hashlib_method=hashlib.md5)

@patch.object(FileChecksums, "_compute_checksum")
def test_checksum_sha1(self, method_compute_checksum) -> None:
self.assertEqual(method_compute_checksum.return_value, self.mock_checksums.checksum_sha1)
method_compute_checksum.assert_called_once_with(native_tool="sha1sum", hashlib_method=hashlib.sha1)

@patch.object(FileChecksums, "_compute_checksum")
def test_checksum_sha256(self, method_compute_checksum) -> None:
self.assertEqual(method_compute_checksum.return_value, self.mock_checksums.checksum_sha256)
method_compute_checksum.assert_called_once_with(native_tool="sha256sum", hashlib_method=hashlib.sha256)

@patch("shutil.which")
@patch("subprocess.run")
def test_checksum_when_native_tool_available(self, mock_subprocess_run, mock_shutil_which) -> None:
mock_shutil_which.return_value = "some/path"
mock_subprocess_run.return_value.stdout = "checksum"
mock_hashlib_method = MagicMock(name="mock_hashlib_method")
self.mock_file_path = Path("/dev/null/hello")
self.assertEqual(
"checksum", self.mock_checksums._compute_checksum(native_tool="tool", hashlib_method=mock_hashlib_method)
)
mock_subprocess_run.assert_called_once_with(
["some/path", "/dev/null/hello"],
capture_output=True,
text=True,
encoding="utf-8",
check=True,
)
mock_shutil_which.assert_called_once_with("tool")

@patch("shutil.which")
def test_checksum_when_native_tool_not_available(self, mock_shutil_which) -> None:
mock_shutil_which.return_value = None
self.mock_file_path.read_bytes.return_value = b"content"
mock_hashlib_method = MagicMock(name="mock_hashlib_method")

self.assertEqual(
mock_hashlib_method.return_value.hexdigest.return_value,
self.mock_checksums._compute_checksum(native_tool="tool", hashlib_method=mock_hashlib_method),
)

mock_shutil_which.assert_called_once_with("tool")
self.mock_file_path.read_bytes.assert_called_once_with()
mock_hashlib_method.assert_called_once_with(b"content")
mock_hashlib_method.return_value.hexdigest.assert_called_once_with()
@pytest.fixture
def mock_file_path(mocker: MockFixture):
    """Provide a ``MagicMock`` named ``file_path`` whose spec is ``Path``."""
    path_double = mocker.MagicMock(name="file_path", spec=Path)
    return path_double


@pytest.fixture
def mock_checksums(mock_file_path) -> FileChecksums:
    """Provide a ``FileChecksums`` instance built on top of the ``mock_file_path`` fixture."""
    instance = FileChecksums(file_path=mock_file_path)
    return instance


def test_file_path(mock_file_path, mock_checksums) -> None:
    """The ``file_path`` property compares equal to the path passed to the constructor."""
    exposed_path = mock_checksums.file_path
    assert exposed_path == mock_file_path


def test_checksum_md5(mocker: MockFixture, mock_checksums: FileChecksums) -> None:
    """``checksum_md5`` returns the result of ``_compute_checksum`` called with ``hashlib.md5``."""
    patched_compute = mocker.patch.object(FileChecksums, "_compute_checksum")
    observed = mock_checksums.checksum_md5
    assert observed == patched_compute.return_value
    patched_compute.assert_called_once_with(hashlib_method=hashlib.md5)


def test_compute_checksum_sha1(mocker: MockFixture, mock_checksums: FileChecksums) -> None:
    """``checksum_sha1`` returns the result of ``_compute_checksum`` called with ``hashlib.sha1``."""
    patched_compute = mocker.patch.object(FileChecksums, "_compute_checksum")
    observed = mock_checksums.checksum_sha1
    assert observed == patched_compute.return_value
    patched_compute.assert_called_once_with(hashlib_method=hashlib.sha1)


def test_compute_checksum_sha256(mocker: MockFixture, mock_checksums: FileChecksums) -> None:
    """``checksum_sha256`` returns the result of ``_compute_checksum`` called with ``hashlib.sha256``."""
    patched_compute = mocker.patch.object(FileChecksums, "_compute_checksum")
    observed = mock_checksums.checksum_sha256
    assert observed == patched_compute.return_value
    patched_compute.assert_called_once_with(hashlib_method=hashlib.sha256)


def test_compute_checksum(mocker: MockFixture) -> None:
    """``_compute_checksum`` opens the file in binary mode and feeds its content to a fresh hasher.

    The mocked path yields a ``BytesIO`` holding ``b"content"``; this is expected to reach the hasher
    in a single ``update`` call, and the hasher's ``hexdigest`` result is expected as return value.
    """
    # arrange
    hasher_factory = mocker.MagicMock(name="mock_hashlib_method")
    path_double = mocker.MagicMock(name="file_path", spec=Path)
    path_double.open.return_value.__enter__.return_value = io.BytesIO(b"content")
    subject = FileChecksums(file_path=path_double)

    # act
    digest = subject._compute_checksum(hashlib_method=hasher_factory)

    # assert
    hasher = hasher_factory.return_value
    assert digest == hasher.hexdigest.return_value
    path_double.open.assert_called_once_with("rb")
    hasher_factory.assert_called_once_with()
    hasher.update.assert_called_once_with(b"content")


if __name__ == "__main__":
unittest.main()
pytest.main()

0 comments on commit b77cd0c

Please sign in to comment.