diff --git a/.github/workflows/EVENT_merge_to_master.yml b/.github/workflows/EVENT_merge_to_master.yml index d49415792..bb97e1134 100644 --- a/.github/workflows/EVENT_merge_to_master.yml +++ b/.github/workflows/EVENT_merge_to_master.yml @@ -9,6 +9,7 @@ on: permissions: contents: read + id-token: write # Necessary for the generate documentation job concurrency: group: ${{ github.workflow }}-${{ github.ref }} @@ -24,6 +25,9 @@ jobs: name: Documentation uses: ./.github/workflows/JOB_generate_documentation.yml secrets: inherit + permissions: + id-token: write + contents: read warn_on_fail: needs: [run_tests, documentation] diff --git a/.github/workflows/EVENT_tag.yml b/.github/workflows/EVENT_tag.yml index e0ac39292..bd4c16c0a 100644 --- a/.github/workflows/EVENT_tag.yml +++ b/.github/workflows/EVENT_tag.yml @@ -15,7 +15,7 @@ env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} permissions: - contents: write + contents: read jobs: create_release: diff --git a/.github/workflows/JOB_typecheck.yml b/.github/workflows/JOB_typecheck.yml index 1ca9d2348..11a5c77b6 100644 --- a/.github/workflows/JOB_typecheck.yml +++ b/.github/workflows/JOB_typecheck.yml @@ -36,5 +36,5 @@ jobs: - name: MyPy typecheck shell: bash run: | - pip install pydantic==2.8.2 + pip install pydantic bash ${{ github.workspace }}/deploy/format_lint.sh typecheck ${{ inputs.files }} diff --git a/darwin/dataset/download_manager.py b/darwin/dataset/download_manager.py index 4eede09eb..752f37579 100644 --- a/darwin/dataset/download_manager.py +++ b/darwin/dataset/download_manager.py @@ -64,8 +64,9 @@ def download_all_images_from_annotations( Recreate folders video_frames : bool, default: False Pulls video frames images instead of video files - force_slots: bool + force_slots: bool, default: False Pulls all slots of items into deeper file structure ({prefix}/{item_name}/{slot_name}/{file_name}) + If False, all multi-slotted items and items with slots containing multiple source files will be downloaded as the deeper file structure Returns ------- @@ -90,7 +91,7 @@ def download_all_images_from_annotations( if is_file_extension_allowed(image.name) } - annotations_to_download_path = [] + annotations_to_download_path: List = [] for annotation_path in annotations_path.glob(f"*.{annotation_format}"): annotation = parse_darwin_json(annotation_path, count=0) if annotation is None: @@ -108,13 +109,14 @@ def download_all_images_from_annotations( ): continue - annotations_to_download_path.append(annotation_path) - if len(annotation.slots) > 1: - force_slots = True + if force_slots: + force_slots_for_item = True + else: + force_slots_for_item = len(annotation.slots) > 1 or any( + len(slot.source_files) > 1 for slot in annotation.slots + ) - for slot in annotation.slots: - if len(slot.source_files) > 1: - force_slots = True + annotations_to_download_path.append((annotation_path, force_slots_for_item)) if remove_extra: annotations = ( @@ -138,7 +140,7 @@ def download_all_images_from_annotations( # Create the generator with the partial functions download_functions: List = [] - for annotation_path in annotations_to_download_path: + for annotation_path, force_slots in annotations_to_download_path: file_download_functions = lazy_download_image_from_annotation( api_key, annotation_path, diff --git a/tests/darwin/dataset/remote_dataset_test.py b/tests/darwin/dataset/remote_dataset_test.py index fb2790bf9..c974ace0e 100644 --- a/tests/darwin/dataset/remote_dataset_test.py +++ b/tests/darwin/dataset/remote_dataset_test.py @@ -15,7 +15,10 @@ from darwin.client import Client from darwin.config import Config from darwin.dataset import RemoteDataset -from darwin.dataset.download_manager import _download_image_from_json_annotation +from darwin.dataset.download_manager import ( + _download_image_from_json_annotation, + download_all_images_from_annotations, +) from darwin.dataset.release import Release, ReleaseStatus from darwin.dataset.remote_dataset_v2 import RemoteDatasetV2 from darwin.dataset.upload_manager import LocalFile, UploadHandlerV2 @@ -25,6 +28,12 @@ from tests.fixtures import * +@pytest.fixture +def mock_is_file_extension_allowed(): + with patch("darwin.dataset.download_manager.is_file_extension_allowed") as mock: + yield mock + + @pytest.fixture def annotation_name() -> str: return "test_video.json" @@ -1373,3 +1382,71 @@ def test_omits_unavailable_releases_when_retry_is_false( releases = remote_dataset.get_releases(include_unavailable=False) assert len(releases) == 1 assert isinstance(releases[0], Release) + + +def test_force_slots_true(mock_is_file_extension_allowed): + mock_is_file_extension_allowed.return_value = True + with tempfile.TemporaryDirectory() as tmp_dir: + with zipfile.ZipFile("tests/data.zip") as zfile: + zfile.extractall(tmp_dir) + annotations_path = Path(tmp_dir) / "v7-darwin-json-v2/force_slots" + images_path = Path("images") + generator, count = download_all_images_from_annotations( + api_key="api_key", + annotations_path=annotations_path, + images_path=images_path, + force_slots=True, + force_replace=True, + ) + + download_funcs = list(generator()) + planned_paths = [download_funcs[i].args[2] for i in range(count)] + expected_paths = [ + Path("images/dir1/single_slotted_video_flat/0/mini_uct.mp4"), + Path("images/dir1/multiple_slots_multiple_source_files/0.1/slice_1.dcm"), + Path("images/dir1/multiple_slots_multiple_source_files/0.1/slice_2.dcm"), + Path("images/dir1/multiple_slots_multiple_source_files/1.1/slice_1.dcm"), + Path("images/dir1/multiple_slots_multiple_source_files/1.1/slice_2.dcm"), + Path("images/dir1/multiple_slots_multiple_source_files/1.1/slice_3.dcm"), + Path("images/dir1/single_slotted_image_flat/0/001.png"), + Path("images/dir1/multi_slotted_item_flat/0/000000580654.jpg"), + Path("images/dir1/multi_slotted_item_flat/1/000000580676.jpg"), + Path("images/dir1/multi_slotted_item_flat/2/000000580703.jpg"), + ] + assert count == 10 + for expected_path in expected_paths: + assert expected_path in planned_paths + + +def test_force_slots_false(mock_is_file_extension_allowed): + mock_is_file_extension_allowed.return_value = True + with tempfile.TemporaryDirectory() as tmp_dir: + with zipfile.ZipFile("tests/data.zip") as zfile: + zfile.extractall(tmp_dir) + annotations_path = Path(tmp_dir) / "v7-darwin-json-v2/force_slots" + images_path = Path("images") + generator, count = download_all_images_from_annotations( + api_key="api_key", + annotations_path=annotations_path, + images_path=images_path, + force_slots=False, + force_replace=True, + ) + + download_funcs = list(generator()) + planned_paths = [download_funcs[i].args[2] for i in range(count)] + expected_paths = [ + Path("images/dir1/single_slotted_video_flat.mp4"), + Path("images/dir1/multiple_slots_multiple_source_files/0.1/slice_1.dcm"), + Path("images/dir1/multiple_slots_multiple_source_files/0.1/slice_2.dcm"), + Path("images/dir1/multiple_slots_multiple_source_files/1.1/slice_1.dcm"), + Path("images/dir1/multiple_slots_multiple_source_files/1.1/slice_2.dcm"), + Path("images/dir1/multiple_slots_multiple_source_files/1.1/slice_3.dcm"), + Path("images/dir1/single_slotted_image_flat.png"), + Path("images/dir1/multi_slotted_item_flat/0/000000580654.jpg"), + Path("images/dir1/multi_slotted_item_flat/1/000000580676.jpg"), + Path("images/dir1/multi_slotted_item_flat/2/000000580703.jpg"), + ] + assert count == 10 + for expected_path in expected_paths: + assert expected_path in planned_paths diff --git a/tests/data.zip b/tests/data.zip index 0a675941a..fda8c0e0b 100644 Binary files a/tests/data.zip and b/tests/data.zip differ