From 240b8577fa5f6776b4ea05e29de3e0fc56196408 Mon Sep 17 00:00:00 2001
From: John Wilkie
Date: Wed, 18 Dec 2024 15:52:37 +0000
Subject: [PATCH] Removed {item_path}/{item_name} validation on

---
 darwin/dataset/upload_manager.py            | 54 --------------
 tests/darwin/dataset/upload_manager_test.py | 79 +--------------------
 2 files changed, 1 insertion(+), 132 deletions(-)

diff --git a/darwin/dataset/upload_manager.py b/darwin/dataset/upload_manager.py
index cddc42658..480cc57b2 100644
--- a/darwin/dataset/upload_manager.py
+++ b/darwin/dataset/upload_manager.py
@@ -19,7 +19,6 @@
     Tuple,
     Dict,
 )
-from rich.console import Console
 
 import requests
 from darwin.datatypes import PathLike, Slot, SourceFile
@@ -27,7+26,6 @@
 from darwin.path_utils import construct_full_path
 from darwin.utils import chunk
 from darwin.utils.utils import is_image_extension_allowed_by_filename, SLOTS_GRID_MAP
-from darwin.importer.importer import _console_theme
 
 if TYPE_CHECKING:
     from darwin.client import Client
@@ -418,7 +416,6 @@ def __init__(
         self.local_files = local_files
         self.dataset: RemoteDataset = dataset
         self.errors: List[UploadRequestError] = []
-        self.skip_existing_full_remote_filepaths()
         self.blocked_items, self.pending_items = self._request_upload(
             handle_as_slices=handle_as_slices
         )
@@ -466,57 +463,6 @@ def progress(self):
         """Current level of upload progress."""
         return self._progress
 
-    def skip_existing_full_remote_filepaths(self) -> None:
-        """
-        Checks if any items to be uploaded have duplicate {item_path}/{item_name} with
-        items already present in the remote dataset. Skip these files and display
-        a warning for each one.
-        """
-        console = Console(theme=_console_theme())
-        full_remote_filepaths = [
-            Path(file.full_path) for file in self.dataset.fetch_remote_files()
-        ]
-
-        multi_file_items_to_remove = []
-        local_files_to_remove = []
-
-        if self.multi_file_items:
-            for multi_file_item in self.multi_file_items:
-                if Path(multi_file_item.full_path) in full_remote_filepaths:
-                    local_files_to_remove.extend(multi_file_item.files)
-                    multi_file_items_to_remove.append(multi_file_item)
-                    console.print(
-                        f"The remote filepath {multi_file_item.full_path} is already occupied by a dataset item in the `{self.dataset.slug}` dataset. Skipping upload of item.",
-                        style="warning",
-                    )
-        if self.local_files:
-            for local_file in self.local_files:
-                if (
-                    Path(local_file.full_path) in full_remote_filepaths
-                    and local_file not in local_files_to_remove
-                ):
-                    local_files_to_remove.append(local_file)
-                    console.print(
-                        f"The remote filepath {local_file.full_path} already exists in the `{self.dataset.slug}` dataset. Skipping upload of item.",
-                        style="warning",
-                    )
-        self.local_files = [
-            local_file
-            for local_file in self.local_files
-            if local_file not in local_files_to_remove
-        ]
-        if self.multi_file_items:
-            self.multi_file_items = [
-                multi_file_item
-                for multi_file_item in self.multi_file_items
-                if multi_file_item not in multi_file_items_to_remove
-            ]
-
-        if not self.local_files and not self.multi_file_items:
-            raise ValueError(
-                "All items to be uploaded have paths that already exist in the remote dataset. No items to upload."
-            )
-
     def prepare_upload(
         self,
     ) -> Optional[Iterator[Callable[[Optional[ByteReadCallback]], None]]]:
diff --git a/tests/darwin/dataset/upload_manager_test.py b/tests/darwin/dataset/upload_manager_test.py
index 9834e48f9..7e01c4251 100644
--- a/tests/darwin/dataset/upload_manager_test.py
+++ b/tests/darwin/dataset/upload_manager_test.py
@@ -65,10 +65,7 @@ def test_request_upload_is_not_called_on_init(
     dataset: RemoteDataset, request_upload_endpoint: str
 ):
     with patch.object(dataset, "fetch_remote_files", return_value=[]):
-        with patch.object(
-            UploadHandler, "skip_existing_full_remote_filepaths", return_value=[]
-        ):
-            upload_handler = UploadHandler.build(dataset, [])
+        upload_handler = UploadHandler.build(dataset, [])
 
     assert upload_handler.pending_count == 0
     assert upload_handler.blocked_count == 0
@@ -521,77 +518,3 @@ def test_default_value_when_env_var_is_not_integer(self, mock: MagicMock):
     def test_value_specified_by_env_var(self, mock: MagicMock):
         assert _upload_chunk_size() == 123
         mock.assert_called_once_with("DARWIN_UPLOAD_CHUNK_SIZE")
-
-
-def test_skip_existing_full_remote_filepaths_with_local_files():
-    mock_dataset = MagicMock()
-    mock_dataset.fetch_remote_files.return_value = [
-        MagicMock(full_path="/existing_file_1.jpg"),
-        MagicMock(full_path="/existing_file_2.jpg"),
-    ]
-    mock_dataset.slug = "test-dataset"
-
-    local_file_1 = MagicMock(full_path="/existing_file_1.jpg")
-    local_file_2 = MagicMock(full_path="/new_file.jpg")
-
-    with patch("darwin.dataset.upload_manager.Console.print") as mock_print:
-        upload_handler = UploadHandlerV2(mock_dataset, [local_file_1, local_file_2], [])
-
-    assert local_file_1 not in upload_handler.local_files
-    assert local_file_2 in upload_handler.local_files
-
-    mock_print.assert_any_call(
-        "The remote filepath /existing_file_1.jpg already exists in the `test-dataset` dataset. Skipping upload of item.",
-        style="warning",
-    )
-
-
-def test_skip_existing_full_remote_filepaths_with_multi_file_items():
-    mock_dataset = MagicMock()
-    mock_dataset.fetch_remote_files.return_value = [
-        MagicMock(full_path="/existing_multi_file_item.jpg"),
-    ]
-    mock_dataset.slug = "test-dataset"
-
-    multi_file_item_1 = MagicMock(
-        full_path="/existing_multi_file_item.jpg", files=[MagicMock()]
-    )
-    multi_file_item_2 = MagicMock(
-        full_path="/new_multi_file_item.jpg", files=[MagicMock()]
-    )
-
-    with patch("darwin.dataset.upload_manager.Console.print") as mock_print:
-        upload_handler = UploadHandlerV2(
-            mock_dataset, [], [multi_file_item_1, multi_file_item_2]
-        )
-
-    assert multi_file_item_1 not in upload_handler.multi_file_items
-    assert multi_file_item_2 in upload_handler.multi_file_items
-
-    # Verify that the correct warning was printed
-    mock_print.assert_any_call(
-        "The remote filepath /existing_multi_file_item.jpg is already occupied by a dataset item in the `test-dataset` dataset. Skipping upload of item.",
-        style="warning",
-    )
-
-
-def test_skip_existing_full_remote_filepaths_raises_if_no_files_left():
-    mock_dataset = MagicMock()
-    mock_dataset.fetch_remote_files.return_value = [
-        MagicMock(full_path="/existing_multi_file_item_1.jpg"),
-        MagicMock(full_path="/existing_multi_file_item_2.jpg"),
-    ]
-    mock_dataset.slug = "test-dataset"
-
-    multi_file_item_1 = MagicMock(
-        full_path="/existing_multi_file_item_1.jpg", files=[MagicMock()]
-    )
-    multi_file_item_2 = MagicMock(
-        full_path="/existing_multi_file_item_2.jpg", files=[MagicMock()]
-    )
-
-    with pytest.raises(
-        ValueError,
-        match="All items to be uploaded have paths that already exist in the remote dataset. No items to upload.",
-    ):
-        UploadHandlerV2(mock_dataset, [], [multi_file_item_1, multi_file_item_2])
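
Note: with skip_existing_full_remote_filepaths() removed, the handler no longer skips files whose {item_path}/{item_name} already exists remotely and no longer raises when every path is taken. The sketch below is one way a caller could reproduce that pre-filtering before building the handler; it only reuses calls seen in the removed code (fetch_remote_files(), full_path, slug), and the helper name filter_out_existing_paths is hypothetical, not part of darwin-py.

from pathlib import Path

def filter_out_existing_paths(dataset, local_files):
    # Hypothetical helper: keep only files whose remote path is not taken yet.
    # `dataset` is assumed to be a connected RemoteDataset, `local_files` a list
    # of LocalFile objects, as used elsewhere in upload_manager.py.
    remote_paths = {Path(f.full_path) for f in dataset.fetch_remote_files()}
    kept = []
    for local_file in local_files:
        if Path(local_file.full_path) in remote_paths:
            # Mirror the old warning, but leave the skip decision to the caller.
            print(
                f"The remote filepath {local_file.full_path} already exists in "
                f"the `{dataset.slug}` dataset. Skipping upload of item."
            )
            continue
        kept.append(local_file)
    return kept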