From c3e29a89137dd386fda2413d1bcfa3542bb1684e Mon Sep 17 00:00:00 2001
From: John Wilkie <124276291+JBWilkie@users.noreply.github.com>
Date: Thu, 21 Nov 2024 22:54:51 +0000
Subject: [PATCH] Allow download of frames extracted from videos &
 multi-slotted items (#972)

---
 darwin/dataset/download_manager.py            | 51 ++++++++++--
 darwin/dataset/remote_dataset.py              |  4 +-
 darwin/exporter/formats/nifti.py              |  2 +-
 darwin/utils/utils.py                         | 12 ++-
 tests/darwin/dataset/download_manager_test.py | 82 +++++++++++++++++--
 5 files changed, 129 insertions(+), 22 deletions(-)

diff --git a/darwin/dataset/download_manager.py b/darwin/dataset/download_manager.py
index 2fc0d061e..b36f23f18 100644
--- a/darwin/dataset/download_manager.py
+++ b/darwin/dataset/download_manager.py
@@ -19,7 +19,11 @@
 from rich.console import Console
 
 import darwin.datatypes as dt
-from darwin.dataset.utils import sanitize_filename
+from darwin.dataset.utils import (
+    sanitize_filename,
+    SUPPORTED_IMAGE_EXTENSIONS,
+    SUPPORTED_VIDEO_EXTENSIONS,
+)
 from darwin.datatypes import AnnotationFile
 from darwin.exceptions import MissingDependency
 from darwin.utils import (
@@ -301,12 +305,12 @@ def _download_all_slots_from_json_annotation(
                     )
         else:
             for upload in slot.source_files:
-                file_path = slot_path / sanitize_filename(upload["file_name"])
+                file_path = slot_path / sanitize_filename(upload.file_name)
                 generator.append(
                     functools.partial(
                         _download_image_with_trace,
                         annotation,
-                        upload["url"],
+                        upload.url,
                         file_path,
                         api_key,
                     )
@@ -357,8 +361,8 @@ def _download_single_slot_from_json_annotation(
     else:
         if len(slot.source_files) > 0:
             image = slot.source_files[0]
-            image_url = image["url"]
-            image_filename = image["file_name"]
+            image_url = image.url
+            image_filename = image.file_name
             if image_filename.endswith(".nii.gz"):
                 suffix = ".nii.gz"
                 stem = annotation.filename[: -len(suffix)]
@@ -670,9 +674,44 @@ def _get_planned_image_paths(
             return [images_path / filename]
     else:
         for slot in annotation.slots:
+            if len(slot.source_files) > 1:
+                # Check that the item is either a DICOM series or a frame extracted from a video
+                is_dicom_series = all(
+                    source_file.file_name.endswith(".dcm")  # type: ignore
+                    for source_file in slot.source_files
+                )
+                is_extracted_frame = (
+                    len(slot.source_files) == 2
+                    and any(
+                        source_file.file_name.endswith(ext)  # type: ignore
+                        for ext in SUPPORTED_VIDEO_EXTENSIONS
+                        for source_file in slot.source_files
+                    )
+                    and any(
+                        source_file.file_name.endswith(ext)  # type: ignore
+                        for ext in SUPPORTED_IMAGE_EXTENSIONS
+                        for source_file in slot.source_files
+                    )
+                )
+                if is_extracted_frame:
+                    # Select only the image if it's an extracted frame
+                    frame_source_file = next(
+                        source_file
+                        for source_file in slot.source_files
+                        if any(
+                            source_file.file_name.endswith(ext)  # type: ignore
+                            for ext in SUPPORTED_IMAGE_EXTENSIONS
+                        )
+                    )
+                    slot.source_files = [frame_source_file]
+                if not is_dicom_series and not is_extracted_frame:
+                    raise ValueError(
+                        "This slot contains data that is not a DICOM series or a frame extracted from a video"
+                    )
+
             slot_name = Path(slot.name)
             for source_file in slot.source_files:
-                file_name = source_file.file_name
+                file_name = source_file.file_name  # type: ignore
                 if use_folders and annotation.remote_path != "/":
                     file_paths.append(
                         images_path
diff --git a/darwin/dataset/remote_dataset.py b/darwin/dataset/remote_dataset.py
index 744b73860..fcfb3cf8d 100644
--- a/darwin/dataset/remote_dataset.py
+++ b/darwin/dataset/remote_dataset.py
@@ -168,9 +168,9 @@ def split_video_annotations(self, release_name: str = "latest") -> None:
                 # When splitting into frames, we need to read each frame individually
                 # Because we use the source name suffix, we need to adjust this to .png here
                 current_stem = Path(
-                    annotation["item"]["slots"][0]["source_files"][0]["file_name"]
+                    annotation["item"]["slots"][0]["source_files"][0].file_name
                 ).stem
-                annotation["item"]["slots"][0]["source_files"][0]["file_name"] = (
+                annotation["item"]["slots"][0]["source_files"][0].file_name = (
                     current_stem + ".png"
                 )
                 # We also need to account for the folder that this function creates
diff --git a/darwin/exporter/formats/nifti.py b/darwin/exporter/formats/nifti.py
index 9a861e8dc..0ee944207 100644
--- a/darwin/exporter/formats/nifti.py
+++ b/darwin/exporter/formats/nifti.py
@@ -223,7 +223,7 @@ def check_for_error_and_return_imageid(
     # Check if all item slots have the correct file-extension
     for slot in video_annotation.slots:
         for source_file in slot.source_files:
-            filename = Path(source_file["file_name"])
+            filename = Path(source_file.file_name)
             if not (
                 filename.name.lower().endswith(".nii.gz")
                 or filename.name.lower().endswith(".nii")
diff --git a/darwin/utils/utils.py b/darwin/utils/utils.py
index b7460e0ca..422e31bd8 100644
--- a/darwin/utils/utils.py
+++ b/darwin/utils/utils.py
@@ -612,9 +612,7 @@ def _parse_darwin_v2(path: Path, data: Dict[str, Any]) -> dt.AnnotationFile:
             image_width=slot.width,
             image_height=slot.height,
             image_url=(
-                None
-                if len(slot.source_files or []) == 0
-                else slot.source_files[0]["url"]
+                None if len(slot.source_files or []) == 0 else slot.source_files[0].url
             ),
             image_thumbnail_url=slot.thumbnail_url,
             workview_url=item_source.get("workview_url", None),
@@ -630,12 +628,18 @@ def _parse_darwin_v2(path: Path, data: Dict[str, Any]) -> dt.AnnotationFile:
 
 
 def _parse_darwin_slot(data: Dict[str, Any]) -> dt.Slot:
+    source_files_data = data.get("source_files", [])
+    source_files = [
+        dt.SourceFile(file_name=source_file["file_name"], url=source_file.get("url"))
+        for source_file in source_files_data
+    ]
+
     return dt.Slot(
         name=data["slot_name"],
         type=data["type"],
         width=data.get("width"),
         height=data.get("height"),
-        source_files=data.get("source_files", []),
+        source_files=source_files,
         thumbnail_url=data.get("thumbnail_url"),
         frame_count=data.get("frame_count"),
         frame_urls=data.get("frame_urls"),
diff --git a/tests/darwin/dataset/download_manager_test.py b/tests/darwin/dataset/download_manager_test.py
index ee0f5380c..db9e6f315 100644
--- a/tests/darwin/dataset/download_manager_test.py
+++ b/tests/darwin/dataset/download_manager_test.py
@@ -135,7 +135,9 @@ def test_multi_slot_without_folders_planned_image_paths():
             Slot(
                 name="slot1",
                 type="image",
-                source_files=[SourceFile(file_name="source_name_1.jpg")],
+                source_files=[
+                    SourceFile(file_name="source_name_1.jpg"),
+                ],
             ),
             Slot(
                 name="slot2",
@@ -208,10 +210,10 @@ def test_single_slot_root_path_with_folders_planned_image_paths():
     assert result == expected
 
 
-def test_multiple_source_files_planned_image_paths():
+def test_dicom_series_planned_image_paths():
     annotation = AnnotationFile(
-        path=Path("/local/annotations/image.json"),
-        filename="image.jpg",
+        path=Path("/local/annotations/series.json"),
+        filename="series.dcm",
         annotation_classes={
             AnnotationClass(name="test_class", annotation_type="polygon")
         },
@@ -219,23 +221,85 @@ def test_multiple_source_files_planned_image_paths():
         slots=[
             Slot(
                 name="slot1",
-                type="image",
+                type="dicom",
                 source_files=[
-                    SourceFile(file_name="source_name_1.jpg"),
-                    SourceFile(file_name="source_name_2.jpg"),
+                    SourceFile(file_name="slice_1.dcm"),
+                    SourceFile(file_name="slice_2.dcm"),
+                    SourceFile(file_name="slice_3.dcm"),
                 ],
             )
         ],
+        remote_path="/",
     )
     images_path = Path("/local/images")
     results = dm._get_planned_image_paths(annotation, images_path, use_folders=False)
     expected = [
-        images_path / "image.jpg" / "slot1" / "source_name_1.jpg",
-        images_path / "image.jpg" / "slot1" / "source_name_2.jpg",
+        images_path / "series.dcm" / "slot1" / "slice_1.dcm",
+        images_path / "series.dcm" / "slot1" / "slice_2.dcm",
+        images_path / "series.dcm" / "slot1" / "slice_3.dcm",
     ]
     assert results == expected
 
 
+def test_extracted_frames_planned_image_paths():  # slot pairing one .jpg with one .mp4 source file
+    annotation = AnnotationFile(
+        path=Path("/local/annotations/video.json"),
+        filename="video.mp4",
+        annotation_classes={
+            AnnotationClass(name="test_class", annotation_type="polygon")
+        },
+        annotations=[],
+        slots=[
+            Slot(
+                name="0",
+                type="image",
+                source_files=[  # exactly two entries: one image file and one video file
+                    SourceFile(file_name="frame_0.jpg"),
+                    SourceFile(file_name="video.mp4"),
+                ],
+            ),
+        ],
+        remote_path="/",
+    )
+    images_path = Path("/local/images")
+    results = dm._get_planned_image_paths(annotation, images_path, use_folders=False)
+    expected = [  # only the .jpg is expected; the .mp4 source file yields no planned path
+        images_path / "video.mp4" / "0" / "frame_0.jpg",  # annotation filename / slot name / source file name
+    ]
+    assert results == expected
+
+
+def test_multiple_source_files_raises_error():  # slot with two .jpg source files must be rejected
+    annotation = AnnotationFile(
+        path=Path("/local/annotations/image.json"),
+        filename="image.jpg",
+        annotation_classes={
+            AnnotationClass(name="test_class", annotation_type="polygon")
+        },
+        annotations=[],
+        slots=[
+            Slot(
+                name="slot1",
+                type="image",
+                source_files=[  # two images: not all .dcm, and no video file among them
+                    SourceFile(file_name="image1.jpg"),
+                    SourceFile(file_name="image2.jpg"),
+                ],
+            )
+        ],
+        remote_path="/",
+    )
+    images_path = Path("/local/images")
+
+    with pytest.raises(ValueError) as exc_info:  # planning paths for this slot is expected to fail
+        dm._get_planned_image_paths(annotation, images_path, use_folders=False)
+
+    assert (  # the error message is compared verbatim
+        str(exc_info.value)
+        == "This slot contains data that is not a DICOM series or a frame extracted from a video"
+    )
+
+
 def test__remove_empty_directories(tmp_path: Path) -> None:
     root_dir = tmp_path / "root"
     root_dir.mkdir()