-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
improve file_checksums implementation
- Loading branch information
1 parent
6638330
commit b77cd0c
Showing
3 changed files
with
114 additions
and
98 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
import hashlib | ||
import shutil | ||
import subprocess | ||
import timeit | ||
from pathlib import Path | ||
|
||
|
||
def checksum_native(file: Path) -> str:
    """Return the lowercase SHA-1 hex digest of *file* using the ``sha1sum`` binary.

    The executable is looked up on ``PATH`` with ``shutil.which`` and invoked
    once with the file path as its only argument; the first whitespace-separated
    token of its output is taken as the digest.

    Raises:
        FileNotFoundError: if no ``sha1sum`` executable is found on ``PATH``.
        subprocess.CalledProcessError: if ``sha1sum`` exits with a non-zero
            status (``check=True``).
    """
    binary_path = shutil.which("sha1sum")
    if binary_path is None:
        # Passing None as the program name would otherwise surface as an
        # opaque TypeError from subprocess.
        raise FileNotFoundError("sha1sum executable not found on PATH")

    result = subprocess.run(
        [binary_path, str(file)],
        capture_output=True,
        text=True,
        encoding="utf-8",
        check=True,
    )
    # GNU sha1sum starts the output line with a backslash when the file name
    # contains a backslash or newline; a hex digest never contains one, so it
    # is safe to strip it.
    return result.stdout.split()[0].lstrip("\\").lower()
|
||
|
||
def checksum_naive(file: Path) -> str:
    """Return the SHA-1 hex digest of *file*, hashing its contents in one call.

    The complete file is loaded into memory with ``read_bytes`` before it is
    handed to ``hashlib.sha1``.
    """
    contents = file.read_bytes()
    digest = hashlib.sha1(contents)
    return digest.hexdigest()
|
||
|
||
def checksum_chunked(file: Path, chunksize: int = 4096) -> str:
    """Return the SHA-1 hex digest of *file*, reading it ``chunksize`` bytes at a time.

    Args:
        file: Path of the file to hash; it is opened in binary mode.
        chunksize: Number of bytes requested per ``read`` call. Must not be 0.

    Raises:
        ValueError: if ``chunksize`` is 0.
    """
    if chunksize == 0:
        # read(0) returns b"" immediately, which would end the loop before any
        # data is hashed and silently yield the digest of empty input.
        raise ValueError("chunksize must not be 0")

    hasher = hashlib.sha1()
    with open(file, "rb") as f:
        # iter() with a sentinel keeps reading until read() returns b"" (EOF).
        for chunk in iter(lambda: f.read(chunksize), b""):
            hasher.update(chunk)
    return hasher.hexdigest()
|
||
|
||
def main():
    """Benchmark the SHA-1 implementations on a 200 MiB test file.

    The test file is created inside a temporary directory that is removed when
    the benchmark finishes (or fails), so nothing is left behind and no
    existing ``testfile.png`` in the working directory is overwritten. After a
    sanity check that all three implementations agree, each variant is timed
    with ``timeit`` over 10 runs and the result is printed.

    Raises:
        AssertionError: if the implementations disagree on the digest.
    """
    import tempfile  # local import: only this benchmark driver needs it

    with tempfile.TemporaryDirectory() as tmp_dir:
        # create test file of size 200 MiB
        file = Path(tmp_dir) / "testfile.png"
        with file.open("wb") as f:
            # Seek to the last byte and write it so the file reaches full size.
            f.seek(200 * 1024 * 1024 - 1)
            f.write(b"\0")

        # sanity check
        print("Performing sanity check")
        digests = {
            "naive": checksum_naive(file),
            "chunked": checksum_chunked(file),
            "native": checksum_native(file),
        }
        # Explicit raise instead of `assert`, which is stripped under `python -O`.
        if len(set(digests.values())) != 1:
            raise AssertionError(f"Checksum implementations disagree: {digests}")
        print("Sanity check passed")

        # benchmark
        print("Naive:", timeit.timeit(lambda: checksum_naive(file), number=10))
        for chunksize in (4096, 8192, 16384, 30000, 50000):
            # The lambda is executed within this iteration, so it sees the
            # current chunksize.
            elapsed = timeit.timeit(lambda: checksum_chunked(file, chunksize=chunksize), number=10)
            print(f"Chunked {chunksize}:", elapsed)
        print("Native:", timeit.timeit(lambda: checksum_native(file), number=10))


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,73 +1,58 @@ | ||
import hashlib | ||
import unittest | ||
from functools import cached_property | ||
import io | ||
from pathlib import Path | ||
from unittest.mock import MagicMock, patch | ||
|
||
import pytest | ||
from pytest_mock import MockFixture | ||
|
||
from depiction.persistence.file_checksums import FileChecksums | ||
|
||
|
||
class TestFileChecksums(unittest.TestCase):
    """unittest-based tests for ``FileChecksums`` built around a mocked file path."""

    def setUp(self) -> None:
        # Fresh Path-spec'd mock for every test; a test may replace it before
        # first touching ``mock_checksums``.
        self.mock_file_path = MagicMock(name="file_path", spec=Path)

    @cached_property
    def mock_checksums(self) -> FileChecksums:
        # Created on first access, using whatever ``mock_file_path`` is at that moment.
        return FileChecksums(file_path=self.mock_file_path)

    def test_file_path(self) -> None:
        checksums = self.mock_checksums
        self.assertEqual(self.mock_file_path, checksums.file_path)

    @patch.object(FileChecksums, "_compute_checksum")
    def test_checksum_md5(self, method_compute_checksum) -> None:
        # The property must return whatever _compute_checksum returns.
        observed = self.mock_checksums.checksum_md5
        self.assertEqual(method_compute_checksum.return_value, observed)
        method_compute_checksum.assert_called_once_with(native_tool="md5sum", hashlib_method=hashlib.md5)

    @patch.object(FileChecksums, "_compute_checksum")
    def test_checksum_sha1(self, method_compute_checksum) -> None:
        observed = self.mock_checksums.checksum_sha1
        self.assertEqual(method_compute_checksum.return_value, observed)
        method_compute_checksum.assert_called_once_with(native_tool="sha1sum", hashlib_method=hashlib.sha1)

    @patch.object(FileChecksums, "_compute_checksum")
    def test_checksum_sha256(self, method_compute_checksum) -> None:
        observed = self.mock_checksums.checksum_sha256
        self.assertEqual(method_compute_checksum.return_value, observed)
        method_compute_checksum.assert_called_once_with(native_tool="sha256sum", hashlib_method=hashlib.sha256)

    @patch("shutil.which")
    @patch("subprocess.run")
    def test_checksum_when_native_tool_available(self, mock_subprocess_run, mock_shutil_which) -> None:
        mock_shutil_which.return_value = "some/path"
        mock_subprocess_run.return_value.stdout = "checksum"
        mock_hashlib_method = MagicMock(name="mock_hashlib_method")
        # Swap in a real Path before mock_checksums is first accessed.
        self.mock_file_path = Path("/dev/null/hello")

        observed = self.mock_checksums._compute_checksum(native_tool="tool", hashlib_method=mock_hashlib_method)
        self.assertEqual("checksum", observed)

        expected_run_options = {
            "capture_output": True,
            "text": True,
            "encoding": "utf-8",
            "check": True,
        }
        mock_subprocess_run.assert_called_once_with(["some/path", "/dev/null/hello"], **expected_run_options)
        mock_shutil_which.assert_called_once_with("tool")

    @patch("shutil.which")
    def test_checksum_when_native_tool_not_available(self, mock_shutil_which) -> None:
        mock_shutil_which.return_value = None
        self.mock_file_path.read_bytes.return_value = b"content"
        mock_hashlib_method = MagicMock(name="mock_hashlib_method")

        observed = self.mock_checksums._compute_checksum(native_tool="tool", hashlib_method=mock_hashlib_method)
        hasher = mock_hashlib_method.return_value
        self.assertEqual(hasher.hexdigest.return_value, observed)

        mock_shutil_which.assert_called_once_with("tool")
        self.mock_file_path.read_bytes.assert_called_once_with()
        mock_hashlib_method.assert_called_once_with(b"content")
        hasher.hexdigest.assert_called_once_with()
@pytest.fixture
def mock_file_path(mocker: MockFixture):
    """Provide a ``MagicMock`` named ``file_path`` that is spec'd to ``Path``."""
    path_double = mocker.MagicMock(name="file_path", spec=Path)
    return path_double
|
||
|
||
@pytest.fixture
def mock_checksums(mock_file_path) -> FileChecksums:
    """Provide a ``FileChecksums`` instance constructed with the mocked file path."""
    checksums = FileChecksums(file_path=mock_file_path)
    return checksums
|
||
|
||
def test_file_path(mock_file_path, mock_checksums) -> None:
    """``file_path`` must compare equal to the path given at construction."""
    stored_path = mock_checksums.file_path
    assert stored_path == mock_file_path
|
||
|
||
def test_checksum_md5(mocker: MockFixture, mock_checksums: FileChecksums) -> None:
    """``checksum_md5`` must return the result of ``_compute_checksum`` called with ``hashlib.md5``."""
    mock_compute_checksum = mocker.patch.object(FileChecksums, "_compute_checksum")
    observed = mock_checksums.checksum_md5
    assert observed == mock_compute_checksum.return_value
    mock_compute_checksum.assert_called_once_with(hashlib_method=hashlib.md5)
|
||
|
||
def test_compute_checksum_sha1(mocker: MockFixture, mock_checksums: FileChecksums) -> None:
    """``checksum_sha1`` must return the result of ``_compute_checksum`` called with ``hashlib.sha1``."""
    mock_compute_checksum = mocker.patch.object(FileChecksums, "_compute_checksum")
    observed = mock_checksums.checksum_sha1
    assert observed == mock_compute_checksum.return_value
    mock_compute_checksum.assert_called_once_with(hashlib_method=hashlib.sha1)
|
||
|
||
def test_compute_checksum_sha256(mocker: MockFixture, mock_checksums: FileChecksums) -> None:
    """``checksum_sha256`` must return the result of ``_compute_checksum`` called with ``hashlib.sha256``."""
    mock_compute_checksum = mocker.patch.object(FileChecksums, "_compute_checksum")
    observed = mock_checksums.checksum_sha256
    assert observed == mock_compute_checksum.return_value
    mock_compute_checksum.assert_called_once_with(hashlib_method=hashlib.sha256)
|
||
|
||
def test_compute_checksum(mocker: MockFixture) -> None:
    """Check how ``_compute_checksum`` uses the file and the hash constructor.

    The mocked path's ``open`` yields an in-memory stream holding ``b"content"``.
    The test expects ``open("rb")``, a no-argument call to ``hashlib_method``,
    one ``update`` call with the stream's content, and the hasher's
    ``hexdigest`` return value as the result.
    """
    payload = b"content"
    mock_file_path = mocker.MagicMock(name="file_path", spec=Path)
    mock_file_path.open.return_value.__enter__.return_value = io.BytesIO(payload)
    mock_hashlib_method = mocker.MagicMock(name="mock_hashlib_method")
    mock_checksums = FileChecksums(file_path=mock_file_path)

    checksum = mock_checksums._compute_checksum(hashlib_method=mock_hashlib_method)
    hasher = mock_hashlib_method.return_value
    assert checksum == hasher.hexdigest.return_value

    mock_file_path.open.assert_called_once_with("rb")
    mock_hashlib_method.assert_called_once_with()
    hasher.update.assert_called_once_with(b"content")
|
||
|
||
if __name__ == "__main__":
    # unittest.main() finishes by calling sys.exit(), so a pytest.main() placed
    # after it was never reached and the pytest-style tests never ran. pytest
    # also collects unittest.TestCase classes, so one pytest run covers both
    # styles; its return value is propagated as the process exit status.
    raise SystemExit(pytest.main())