Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DAR-5293][External] Removed {item_path}/{item_name} validation on push #984

Merged
merged 1 commit into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 0 additions & 54 deletions darwin/dataset/upload_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,13 @@
Tuple,
Dict,
)
from rich.console import Console
import requests

from darwin.datatypes import PathLike, Slot, SourceFile
from darwin.doc_enum import DocEnum
from darwin.path_utils import construct_full_path
from darwin.utils import chunk
from darwin.utils.utils import is_image_extension_allowed_by_filename, SLOTS_GRID_MAP
from darwin.importer.importer import _console_theme

if TYPE_CHECKING:
from darwin.client import Client
Expand Down Expand Up @@ -418,7 +416,6 @@ def __init__(
self.local_files = local_files
self.dataset: RemoteDataset = dataset
self.errors: List[UploadRequestError] = []
self.skip_existing_full_remote_filepaths()
self.blocked_items, self.pending_items = self._request_upload(
handle_as_slices=handle_as_slices
)
Expand Down Expand Up @@ -466,57 +463,6 @@ def progress(self):
"""Current level of upload progress."""
return self._progress

def skip_existing_full_remote_filepaths(self) -> None:
    """
    Compare the full remote path ({item_path}/{item_name}) of every item queued
    for upload against the items already present in the remote dataset, dropping
    any queued item whose path is already taken and printing a warning for each.

    Raises
    ------
    ValueError
        If every queued item clashes with an existing remote filepath, leaving
        nothing to upload.
    """
    warning_console = Console(theme=_console_theme())
    # Paths already occupied in the remote dataset; a set gives O(1) membership tests.
    occupied_paths = {
        Path(remote_file.full_path)
        for remote_file in self.dataset.fetch_remote_files()
    }

    skipped_multi_file_items = []
    skipped_local_files = []

    for multi_file_item in self.multi_file_items or []:
        if Path(multi_file_item.full_path) not in occupied_paths:
            continue
        skipped_multi_file_items.append(multi_file_item)
        # Skipping a multi-file item means skipping every file it contains.
        skipped_local_files.extend(multi_file_item.files)
        warning_console.print(
            f"The remote filepath {multi_file_item.full_path} is already occupied by a dataset item in the `{self.dataset.slug}` dataset. Skipping upload of item.",
            style="warning",
        )

    for local_file in self.local_files or []:
        # Files already marked for removal via a multi-file item get no second warning.
        if (
            Path(local_file.full_path) in occupied_paths
            and local_file not in skipped_local_files
        ):
            skipped_local_files.append(local_file)
            warning_console.print(
                f"The remote filepath {local_file.full_path} already exists in the `{self.dataset.slug}` dataset. Skipping upload of item.",
                style="warning",
            )

    self.local_files = [
        local_file
        for local_file in self.local_files
        if local_file not in skipped_local_files
    ]
    # Guarded so a None multi_file_items attribute stays None rather than becoming [].
    if self.multi_file_items:
        self.multi_file_items = [
            item
            for item in self.multi_file_items
            if item not in skipped_multi_file_items
        ]

    if not self.local_files and not self.multi_file_items:
        raise ValueError(
            "All items to be uploaded have paths that already exist in the remote dataset. No items to upload."
        )

def prepare_upload(
self,
) -> Optional[Iterator[Callable[[Optional[ByteReadCallback]], None]]]:
Expand Down
79 changes: 1 addition & 78 deletions tests/darwin/dataset/upload_manager_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,7 @@ def test_request_upload_is_not_called_on_init(
dataset: RemoteDataset, request_upload_endpoint: str
):
with patch.object(dataset, "fetch_remote_files", return_value=[]):
with patch.object(
UploadHandler, "skip_existing_full_remote_filepaths", return_value=[]
):
upload_handler = UploadHandler.build(dataset, [])
upload_handler = UploadHandler.build(dataset, [])

assert upload_handler.pending_count == 0
assert upload_handler.blocked_count == 0
Expand Down Expand Up @@ -521,77 +518,3 @@ def test_default_value_when_env_var_is_not_integer(self, mock: MagicMock):
def test_value_specified_by_env_var(self, mock: MagicMock):
assert _upload_chunk_size() == 123
mock.assert_called_once_with("DARWIN_UPLOAD_CHUNK_SIZE")


def test_skip_existing_full_remote_filepaths_with_local_files():
    """A queued local file whose remote path already exists is skipped with a warning."""
    dataset = MagicMock()
    dataset.fetch_remote_files.return_value = [
        MagicMock(full_path="/existing_file_1.jpg"),
        MagicMock(full_path="/existing_file_2.jpg"),
    ]
    dataset.slug = "test-dataset"

    duplicate_file = MagicMock(full_path="/existing_file_1.jpg")
    fresh_file = MagicMock(full_path="/new_file.jpg")

    with patch("darwin.dataset.upload_manager.Console.print") as print_spy:
        handler = UploadHandlerV2(dataset, [duplicate_file, fresh_file], [])

        # Only the clashing file is dropped from the upload queue.
        assert duplicate_file not in handler.local_files
        assert fresh_file in handler.local_files

        print_spy.assert_any_call(
            "The remote filepath /existing_file_1.jpg already exists in the `test-dataset` dataset. Skipping upload of item.",
            style="warning",
        )


def test_skip_existing_full_remote_filepaths_with_multi_file_items():
    """A queued multi-file item whose remote path is occupied is skipped with a warning."""
    dataset = MagicMock()
    dataset.fetch_remote_files.return_value = [
        MagicMock(full_path="/existing_multi_file_item.jpg"),
    ]
    dataset.slug = "test-dataset"

    clashing_item = MagicMock(
        full_path="/existing_multi_file_item.jpg", files=[MagicMock()]
    )
    fresh_item = MagicMock(
        full_path="/new_multi_file_item.jpg", files=[MagicMock()]
    )

    with patch("darwin.dataset.upload_manager.Console.print") as print_spy:
        handler = UploadHandlerV2(dataset, [], [clashing_item, fresh_item])

        # Only the clashing multi-file item is dropped from the upload queue.
        assert clashing_item not in handler.multi_file_items
        assert fresh_item in handler.multi_file_items

        # The skip warning for the occupied path must have been printed.
        print_spy.assert_any_call(
            "The remote filepath /existing_multi_file_item.jpg is already occupied by a dataset item in the `test-dataset` dataset. Skipping upload of item.",
            style="warning",
        )


def test_skip_existing_full_remote_filepaths_raises_if_no_files_left():
    """Construction fails when every queued item clashes with an existing remote path."""
    dataset = MagicMock()
    dataset.fetch_remote_files.return_value = [
        MagicMock(full_path="/existing_multi_file_item_1.jpg"),
        MagicMock(full_path="/existing_multi_file_item_2.jpg"),
    ]
    dataset.slug = "test-dataset"

    clashing_item_1 = MagicMock(
        full_path="/existing_multi_file_item_1.jpg", files=[MagicMock()]
    )
    clashing_item_2 = MagicMock(
        full_path="/existing_multi_file_item_2.jpg", files=[MagicMock()]
    )

    # With every queued item already present remotely, nothing is left to upload.
    with pytest.raises(
        ValueError,
        match="All items to be uploaded have paths that already exist in the remote dataset. No items to upload.",
    ):
        UploadHandlerV2(dataset, [], [clashing_item_1, clashing_item_2])
Loading