Skip to content

Commit

Permalink
Do not re-download attachment if exit on disk (#15)
Browse files Browse the repository at this point in the history
  • Loading branch information
piotrekkr authored Oct 17, 2024
1 parent dd194d3 commit c78092f
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 83 deletions.
89 changes: 33 additions & 56 deletions src/salesforce_archivist/salesforce/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,78 +206,55 @@ def __init__(
self._stop_event = threading.Event()
self._max_workers = max_workers

def _download_content_version_from_sf(
self, downloaded_versions_list: DownloadedList, version: ContentVersion, download_path: str
def _download_file_from_sf_api(
self, downloaded_list: DownloadedList, download_obj: Union[ContentVersion, Attachment], download_path: str
) -> None:
downloaded_version = downloaded_versions_list.get(version)
pass

def download_file_from_sf(
self,
downloaded_list: DownloadedList,
download_obj: Union[ContentVersion, Attachment],
download_path: str,
) -> None:
downloaded_file = downloaded_list.get(download_obj)
# file exist under the path that we want to download into
if os.path.exists(download_path):
# if no version exist in downloaded list add a new downloaded version with this path
if downloaded_version is None:
downloaded_version = DownloadedSalesforceObject(
obj_id=version.id,
# if no file exist in downloaded list add a new downloaded version with this path
if downloaded_file is None:
downloaded_file = DownloadedSalesforceObject(
obj_id=download_obj.id,
path=download_path,
)
downloaded_versions_list.add(downloaded_version)
downloaded_list.add(downloaded_file)

# version is on downloaded list and version points to existing file on disk
elif downloaded_version is not None and os.path.exists(downloaded_version.path):
# copy existing file if download path is different from already downloaded version path in the list
if downloaded_version.path != download_path:
# file is on downloaded list and it points to existing file on disk
elif downloaded_file is not None and os.path.exists(downloaded_file.path):
# copy existing file if download path is different from already downloaded path in the list
if downloaded_file.path != download_path:
os.makedirs(os.path.dirname(download_path), exist_ok=True)
shutil.copy(downloaded_version.path, download_path)
shutil.copy(downloaded_file.path, download_path)

# download version using SF API and add to the list
# download file using SF API and add to the list
else:
result = self._client.download_content_version(version)
if isinstance(download_obj, ContentVersion):
result = self._client.download_content_version(download_obj)
elif isinstance(download_obj, Attachment):
result = self._client.download_attachment(download_obj)
else:
raise ValueError("Unknown object type provided {type}".format(type=type(download_obj)))

os.makedirs(os.path.dirname(download_path), exist_ok=True)
with open(download_path, "wb") as file:
for chunk in result.iter_content(chunk_size=1024):
if chunk:
file.write(chunk)

downloaded_version = DownloadedSalesforceObject(
obj_id=version.id,
downloaded_file = DownloadedSalesforceObject(
obj_id=download_obj.id,
path=download_path,
)
downloaded_versions_list.add(downloaded_version)

def _download_attachment_from_sf(
self, downloaded_attachment_list: DownloadedList, attachment: Attachment, download_path: str
) -> None:
result = self._client.download_attachment(attachment)
os.makedirs(os.path.dirname(download_path), exist_ok=True)
with open(download_path, "wb") as file:
for chunk in result.iter_content(chunk_size=1024):
if chunk:
file.write(chunk)

downloaded_attachment = DownloadedSalesforceObject(
obj_id=attachment.id,
path=download_path,
)
downloaded_attachment_list.add(downloaded_attachment)

def download_from_sf(
self,
downloaded_list: DownloadedList,
download_obj: Union[ContentVersion, Attachment],
download_path: str,
) -> None:
if isinstance(download_obj, ContentVersion):
self._download_content_version_from_sf(
downloaded_versions_list=downloaded_list,
version=download_obj,
download_path=download_path,
)
elif isinstance(download_obj, Attachment):
self._download_attachment_from_sf(
downloaded_attachment_list=downloaded_list,
attachment=download_obj,
download_path=download_path,
)
else:
raise ValueError("Unknown object type provided {type}".format(type=type(download_obj)))
downloaded_list.add(downloaded_file)

def _print_download_msg(self, msg: str, error: bool = False) -> None:
try:
Expand Down Expand Up @@ -313,7 +290,7 @@ def download_or_wait(
error = False
try:
self._wait_if_api_usage_limit()
self.download_from_sf(
self.download_file_from_sf(
downloaded_list=downloaded_list, download_obj=download_obj, download_path=download_path
)
except StopDownloadException:
Expand Down
61 changes: 34 additions & 27 deletions test/salesforce/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ def test_downloader_download_will_gracefully_shutdown(shutdown_mock, submit_mock
shutdown_mock.assert_has_calls([call(wait=True), call(wait=True, cancel_futures=True)])


@patch.object(Downloader, "download_from_sf", side_effect=RuntimeError)
@patch.object(Downloader, "download_file_from_sf", side_effect=RuntimeError)
def test_downloader_download_will_return_download_stats(download_mock):
archivist_obj = ArchivistObject(data_dir="/fake/dir", obj_type="User")
link_list = ContentDocumentLinkList(data_dir=archivist_obj.obj_dir)
Expand Down Expand Up @@ -323,7 +323,7 @@ def test_downloader_download_will_return_download_stats(download_mock):


@patch("os.path.exists")
def test_downloader_download_from_sf_will_add_already_downloaded_object_to_list(
def test_downloader_download_file_from_sf_will_add_already_downloaded_object_to_list(
exist_mock,
):
exist_mock.return_value = True
Expand All @@ -337,25 +337,32 @@ def test_downloader_download_from_sf_will_add_already_downloaded_object_to_list(
version_number=1,
content_size=10,
)
attachment = Attachment(attachment_id="ID", parent_id="PID", content_size=10, name="Name")
downloaded_list = DownloadedList(data_dir=archivist_obj.obj_dir, file_name="downloaded_versions.csv")
sf_client = Mock()
downloader = Downloader(
sf_client=sf_client,
)
downloader.download_from_sf(downloaded_list=downloaded_list, download_obj=version, download_path="/fake/path")
exist_mock.assert_called_once()
assert len(downloaded_list) == 1
downloader.download_file_from_sf(downloaded_list=downloaded_list, download_obj=version, download_path="/fake/path")
downloader.download_file_from_sf(
downloaded_list=downloaded_list, download_obj=attachment, download_path="/fake/path"
)
assert exist_mock.call_count == 2
assert len(downloaded_list) == 2
assert downloaded_list.get(obj=version).id == version.id
assert downloaded_list.get(obj=attachment).id == attachment.id
assert sf_client.download_attachment.call_count == 0
assert sf_client.download_content_version.call_count == 0


def test_downloader_download_from_sf_will_copy_existing_file_to_new_path():
def test_downloader_download_file_from_sf_will_copy_existing_file_to_new_path():
with tempfile.TemporaryDirectory() as tmp_dir:
archivist_obj = ArchivistObject(data_dir=tmp_dir, obj_type="User")

already_downloaded_path = os.path.join(archivist_obj.obj_dir, "files", "file1.txt")
to_download_path = os.path.join(archivist_obj.obj_dir, "files", "file2.txt")
download_list_mock = MagicMock()
version1 = ContentVersion(
obj1 = ContentVersion(
version_id="CID",
document_id="DOC1",
checksum="c",
Expand All @@ -364,7 +371,7 @@ def test_downloader_download_from_sf_will_copy_existing_file_to_new_path():
version_number=1,
content_size=10,
)
version2 = ContentVersion(
obj2 = ContentVersion(
version_id="CID",
document_id="DOC2",
checksum="c",
Expand All @@ -374,38 +381,38 @@ def test_downloader_download_from_sf_will_copy_existing_file_to_new_path():
content_size=10,
)
download_list_mock.__iter__.return_value = [
(version1, already_downloaded_path),
(version2, to_download_path),
(obj1, already_downloaded_path),
(obj2, to_download_path),
]
os.makedirs(os.path.dirname(already_downloaded_path), exist_ok=True)
file_contents = b"test"
with open(already_downloaded_path, "wb") as downloaded_file:
downloaded_file.write(file_contents)

downloaded_version_list = DownloadedList(data_dir=archivist_obj.obj_dir, file_name="downloaded_versions.csv")
downloaded_version = DownloadedSalesforceObject(
obj_id=version1.id,
downloaded_list = DownloadedList(data_dir=archivist_obj.obj_dir, file_name="downloaded_versions.csv")
downloaded_obj = DownloadedSalesforceObject(
obj_id=obj1.id,
path=already_downloaded_path,
)
downloaded_version_list.add(downloaded_version)
downloaded_list.add(downloaded_obj)
sf_client = Mock()
downloader = Downloader(
sf_client=sf_client,
)
downloader.download_from_sf(
download_obj=version2,
downloader.download_file_from_sf(
download_obj=obj2,
download_path=to_download_path,
downloaded_list=downloaded_version_list,
downloaded_list=downloaded_list,
)
assert os.path.exists(to_download_path)
with open(to_download_path, "rb") as new_file:
assert new_file.read() == file_contents


def test_downloader_download_from_sf_will_download_version_from_salesforce():
def test_downloader_download_file_from_sf_will_download_version_from_salesforce():
with tempfile.TemporaryDirectory() as tmp_dir:
archivist_obj = ArchivistObject(data_dir=tmp_dir, obj_type="User")
version = ContentVersion(
obj = ContentVersion(
version_id="VID1",
document_id="DOC1",
checksum="c1",
Expand All @@ -424,8 +431,8 @@ def test_downloader_download_from_sf_will_download_version_from_salesforce():
sf_client=sf_client,
)
path = os.path.join(tmp_dir, "test.txt")
downloader.download_from_sf(
download_obj=version,
downloader.download_file_from_sf(
download_obj=obj,
download_path=path,
downloaded_list=downloaded_list,
)
Expand All @@ -434,10 +441,10 @@ def test_downloader_download_from_sf_will_download_version_from_salesforce():
assert file.read() == b"test"


def test_downloader_download_from_sf_will_download_attachment_from_salesforce():
def test_downloader_download_file_from_sf_will_download_attachment_from_salesforce():
with tempfile.TemporaryDirectory() as tmp_dir:
archivist_obj = ArchivistObject(data_dir=tmp_dir, obj_type="User")
attachment = Attachment(
archivist_obj = ArchivistObject(data_dir=tmp_dir, obj_type="Attachment")
obj = Attachment(
attachment_id="ID",
parent_id="PID",
content_size=10,
Expand All @@ -453,8 +460,8 @@ def test_downloader_download_from_sf_will_download_attachment_from_salesforce():
sf_client=sf_client,
)
path = os.path.join(tmp_dir, "test.txt")
downloader.download_from_sf(
download_obj=attachment,
downloader.download_file_from_sf(
download_obj=obj,
download_path=path,
downloaded_list=downloaded_list,
)
Expand All @@ -477,7 +484,7 @@ def usage_side_effect(refresh: bool) -> ApiUsage:
sf_client.get_api_usage.side_effect = lambda refresh=False: usage_side_effect(refresh=refresh)
download_list_mock = MagicMock()
download_list_mock.__iter__.return_value = []
with patch.object(Downloader, "download_from_sf"):
with patch.object(Downloader, "download_file_from_sf"):
wait = 7
downloader = Downloader(
sf_client=sf_client,
Expand Down

0 comments on commit c78092f

Please sign in to comment.