From c3e29a89137dd386fda2413d1bcfa3542bb1684e Mon Sep 17 00:00:00 2001 From: John Wilkie <124276291+JBWilkie@users.noreply.github.com> Date: Thu, 21 Nov 2024 22:54:51 +0000 Subject: [PATCH] Allow download of frames extracted from videos & multi-slotted items (#972) --- darwin/dataset/download_manager.py | 51 ++++++++++-- darwin/dataset/remote_dataset.py | 4 +- darwin/exporter/formats/nifti.py | 2 +- darwin/utils/utils.py | 12 ++- tests/darwin/dataset/download_manager_test.py | 82 +++++++++++++++++-- 5 files changed, 129 insertions(+), 22 deletions(-) diff --git a/darwin/dataset/download_manager.py b/darwin/dataset/download_manager.py index 2fc0d061e..b36f23f18 100644 --- a/darwin/dataset/download_manager.py +++ b/darwin/dataset/download_manager.py @@ -19,7 +19,11 @@ from rich.console import Console import darwin.datatypes as dt -from darwin.dataset.utils import sanitize_filename +from darwin.dataset.utils import ( + sanitize_filename, + SUPPORTED_IMAGE_EXTENSIONS, + SUPPORTED_VIDEO_EXTENSIONS, +) from darwin.datatypes import AnnotationFile from darwin.exceptions import MissingDependency from darwin.utils import ( @@ -301,12 +305,12 @@ def _download_all_slots_from_json_annotation( ) else: for upload in slot.source_files: - file_path = slot_path / sanitize_filename(upload["file_name"]) + file_path = slot_path / sanitize_filename(upload.file_name) generator.append( functools.partial( _download_image_with_trace, annotation, - upload["url"], + upload.url, file_path, api_key, ) @@ -357,8 +361,8 @@ def _download_single_slot_from_json_annotation( else: if len(slot.source_files) > 0: image = slot.source_files[0] - image_url = image["url"] - image_filename = image["file_name"] + image_url = image.url + image_filename = image.file_name if image_filename.endswith(".nii.gz"): suffix = ".nii.gz" stem = annotation.filename[: -len(suffix)] @@ -670,9 +674,44 @@ def _get_planned_image_paths( return [images_path / filename] else: for slot in annotation.slots: + if len(slot.source_files) > 1: + # Check that the item is either a DICOM series or a frame extracted from a video + is_dicom_series = all( + source_file.file_name.endswith(".dcm") # type: ignore + for source_file in slot.source_files + ) + is_extracted_frame = ( + len(slot.source_files) == 2 + and any( + source_file.file_name.endswith(ext) # type: ignore + for ext in SUPPORTED_VIDEO_EXTENSIONS + for source_file in slot.source_files + ) + and any( + source_file.file_name.endswith(ext) # type: ignore + for ext in SUPPORTED_IMAGE_EXTENSIONS + for source_file in slot.source_files + ) + ) + if is_extracted_frame: + # Select only the image if it's an extracted frame + frame_source_file = next( + source_file + for source_file in slot.source_files + if any( + source_file.file_name.endswith(ext) # type: ignore + for ext in SUPPORTED_IMAGE_EXTENSIONS + ) + ) + slot.source_files = [frame_source_file] + if not is_dicom_series and not is_extracted_frame: + raise ValueError( + "This slot contains data that is not a DICOM series or a frame extracted from a video" + ) + slot_name = Path(slot.name) for source_file in slot.source_files: - file_name = source_file.file_name + file_name = source_file.file_name # type: ignore if use_folders and annotation.remote_path != "/": file_paths.append( images_path diff --git a/darwin/dataset/remote_dataset.py b/darwin/dataset/remote_dataset.py index 744b73860..fcfb3cf8d 100644 --- a/darwin/dataset/remote_dataset.py +++ b/darwin/dataset/remote_dataset.py @@ -168,9 +168,9 @@ def split_video_annotations(self, release_name: str = "latest") -> None: # When splitting into frames, we need to read each frame individually # Because we use the source name suffix, we need to adjust this to .png here current_stem = Path( - annotation["item"]["slots"][0]["source_files"][0]["file_name"] + annotation["item"]["slots"][0]["source_files"][0].file_name ).stem - annotation["item"]["slots"][0]["source_files"][0]["file_name"] = ( + annotation["item"]["slots"][0]["source_files"][0].file_name = ( current_stem + ".png" ) # We also need to account for the folder that this function creates diff --git a/darwin/exporter/formats/nifti.py b/darwin/exporter/formats/nifti.py index 9a861e8dc..0ee944207 100644 --- a/darwin/exporter/formats/nifti.py +++ b/darwin/exporter/formats/nifti.py @@ -223,7 +223,7 @@ def check_for_error_and_return_imageid( # Check if all item slots have the correct file-extension for slot in video_annotation.slots: for source_file in slot.source_files: - filename = Path(source_file["file_name"]) + filename = Path(source_file.file_name) if not ( filename.name.lower().endswith(".nii.gz") or filename.name.lower().endswith(".nii") diff --git a/darwin/utils/utils.py b/darwin/utils/utils.py index b7460e0ca..422e31bd8 100644 --- a/darwin/utils/utils.py +++ b/darwin/utils/utils.py @@ -612,9 +612,7 @@ def _parse_darwin_v2(path: Path, data: Dict[str, Any]) -> dt.AnnotationFile: image_width=slot.width, image_height=slot.height, image_url=( - None - if len(slot.source_files or []) == 0 - else slot.source_files[0]["url"] + None if len(slot.source_files or []) == 0 else slot.source_files[0].url ), image_thumbnail_url=slot.thumbnail_url, workview_url=item_source.get("workview_url", None), @@ -630,12 +628,18 @@ def _parse_darwin_v2(path: Path, data: Dict[str, Any]) -> dt.AnnotationFile: def _parse_darwin_slot(data: Dict[str, Any]) -> dt.Slot: + source_files_data = data.get("source_files", []) + source_files = [ + dt.SourceFile(file_name=source_file["file_name"], url=source_file.get("url")) + for source_file in source_files_data + ] + return dt.Slot( name=data["slot_name"], type=data["type"], width=data.get("width"), height=data.get("height"), - source_files=data.get("source_files", []), + source_files=source_files, thumbnail_url=data.get("thumbnail_url"), frame_count=data.get("frame_count"), frame_urls=data.get("frame_urls"), diff --git a/tests/darwin/dataset/download_manager_test.py b/tests/darwin/dataset/download_manager_test.py index ee0f5380c..db9e6f315 100644 --- a/tests/darwin/dataset/download_manager_test.py +++ b/tests/darwin/dataset/download_manager_test.py @@ -135,7 +135,9 @@ def test_multi_slot_without_folders_planned_image_paths(): Slot( name="slot1", type="image", - source_files=[SourceFile(file_name="source_name_1.jpg")], + source_files=[ + SourceFile(file_name="source_name_1.jpg"), + ], ), Slot( name="slot2", @@ -208,10 +210,10 @@ def test_single_slot_root_path_with_folders_planned_image_paths(): assert result == expected -def test_multiple_source_files_planned_image_paths(): +def test_dicom_series_planned_image_paths(): annotation = AnnotationFile( - path=Path("/local/annotations/image.json"), - filename="image.jpg", + path=Path("/local/annotations/series.json"), + filename="series.dcm", annotation_classes={ AnnotationClass(name="test_class", annotation_type="polygon") }, @@ -219,23 +221,85 @@ def test_multiple_source_files_planned_image_paths(): slots=[ Slot( name="slot1", - type="image", + type="dicom", source_files=[ - SourceFile(file_name="source_name_1.jpg"), - SourceFile(file_name="source_name_2.jpg"), + SourceFile(file_name="slice_1.dcm"), + SourceFile(file_name="slice_2.dcm"), + SourceFile(file_name="slice_3.dcm"), ], ) ], + remote_path="/", ) images_path = Path("/local/images") results = dm._get_planned_image_paths(annotation, images_path, use_folders=False) expected = [ - images_path / "image.jpg" / "slot1" / "source_name_1.jpg", - images_path / "image.jpg" / "slot1" / "source_name_2.jpg", + images_path / "series.dcm" / "slot1" / "slice_1.dcm", + images_path / "series.dcm" / "slot1" / "slice_2.dcm", + images_path / "series.dcm" / "slot1" / "slice_3.dcm", ] assert results == expected +def test_extracted_frames_planned_image_paths(): + annotation = AnnotationFile( + path=Path("/local/annotations/video.json"), + filename="video.mp4", + annotation_classes={ + AnnotationClass(name="test_class", annotation_type="polygon") + }, + annotations=[], + slots=[ + Slot( + name="0", + type="image", + source_files=[ + SourceFile(file_name="frame_0.jpg"), + SourceFile(file_name="video.mp4"), + ], + ), + ], + remote_path="/", + ) + images_path = Path("/local/images") + results = dm._get_planned_image_paths(annotation, images_path, use_folders=False) + expected = [ + images_path / "video.mp4" / "0" / "frame_0.jpg", + ] + assert results == expected + + +def test_multiple_source_files_raises_error(): + annotation = AnnotationFile( + path=Path("/local/annotations/image.json"), + filename="image.jpg", + annotation_classes={ + AnnotationClass(name="test_class", annotation_type="polygon") + }, + annotations=[], + slots=[ + Slot( + name="slot1", + type="image", + source_files=[ + SourceFile(file_name="image1.jpg"), + SourceFile(file_name="image2.jpg"), + ], + ) + ], + remote_path="/", + ) + images_path = Path("/local/images") + + with pytest.raises(ValueError) as exc_info: + dm._get_planned_image_paths(annotation, images_path, use_folders=False) + + assert ( + str(exc_info.value) + == "This slot contains data that is not a DICOM series or a frame extracted from a video" + ) + + def test__remove_empty_directories(tmp_path: Path) -> None: root_dir = tmp_path / "root" root_dir.mkdir()