v7labs · JBWilkie · Oct 24, 2023 · Oct 5, 2023 · Oct 11, 2023 · Oct 11, 2023
diff --git a/darwin/dataset/local_dataset.py b/darwin/dataset/local_dataset.py
@@ -6,7 +6,12 @@
 from PIL import Image as PILImage
 
 from darwin.dataset.utils import get_classes, get_release_path, load_pil_image
-from darwin.utils import SUPPORTED_IMAGE_EXTENSIONS, parse_darwin_json
+from darwin.utils import (
+    SUPPORTED_IMAGE_EXTENSIONS,
+    get_image_path_from_stream,
+    parse_darwin_json,
+    stream_darwin_json,
+)
 
 
 class LocalDataset:
@@ -126,30 +131,18 @@ def _setup_annotations_and_images(
         partition,
         split_type,
     ):
-        stems = build_stems(
-            release_path, annotations_dir, annotation_type, split, partition, split_type
-        )
-        for stem in stems:
-            annotation_path = annotations_dir / f"{stem}.json"
-            images = []
-            for ext in SUPPORTED_IMAGE_EXTENSIONS:
-                image_path = images_dir / f"{stem}{ext}"
-                if image_path.exists():
-                    images.append(image_path)
-                    continue
-                image_path = images_dir / f"{stem}{ext.upper()}"
-                if image_path.exists():
-                    images.append(image_path)
-            if len(images) < 1:
+        # Find all the annotations and their corresponding images
+        for annotation_path in sorted(annotations_dir.glob("**/*.json")):
+            darwin_json = stream_darwin_json(annotation_path)
+            image_path = get_image_path_from_stream(darwin_json, images_dir)
+            if image_path.exists():
+                self.images_path.append(image_path)
+                self.annotations_path.append(annotation_path)
+                continue
+            else:
                 raise ValueError(
                     f"Annotation ({annotation_path}) does not have a corresponding image"
                 )
-            if len(images) > 1:
-                raise ValueError(
-                    f"Image ({stem}) is present with multiple extensions. This is forbidden."
-                )
-            self.images_path.append(images[0])
-            self.annotations_path.append(annotation_path)
 
     def _initial_setup(self, dataset_path, release_name):
         assert dataset_path is not None

diff --git a/darwin/dataset/utils.py b/darwin/dataset/utils.py
@@ -17,9 +17,11 @@
     SUPPORTED_EXTENSIONS,
     SUPPORTED_VIDEO_EXTENSIONS,
     attempt_decode,
+    get_image_path_from_stream,
     is_unix_like_os,
     parse_darwin_json,
 )
+from darwin.utils.utils import stream_darwin_json
 
 # E.g.: {"partition" => {"class_name" => 123}}
 AnnotationDistribution = Dict[str, Counter]
@@ -569,33 +571,19 @@ def _map_annotations_to_images(
     images_paths = []
     annotations_paths = []
     invalid_annotation_paths = []
-    for stem in stems:
-        annotation_path = annotations_dir / f"{stem}.json"
-        images = []
-        for ext in SUPPORTED_EXTENSIONS:
-            image_path = images_dir / f"{stem}{ext}"
-            if image_path.exists():
-                images.append(image_path)
-                continue
-            image_path = images_dir / f"{stem}{ext.upper()}"
-            if image_path.exists():
-                images.append(image_path)
-
-        image_count = len(images)
-        if image_count != 1 and ignore_inconsistent_examples:
-            invalid_annotation_paths.append(annotation_path)
+    for annotation_path in annotations_dir.glob("**/*.json"):
+        darwin_json = stream_darwin_json(annotation_path)
+        image_path = get_image_path_from_stream(darwin_json, images_dir)
+        if image_path.exists():
+            images_paths.append(image_path)
+            annotations_paths.append(annotation_path)
             continue
-        elif image_count < 1:
-            raise ValueError(
-                f"Annotation ({annotation_path}) does not have a corresponding image"
-            )
-        elif image_count > 1:
-            raise ValueError(
-                f"Image ({stem}) is present with multiple extensions. This is forbidden."
-            )
-
-        images_paths.append(images[0])
-        annotations_paths.append(annotation_path)
+        else:
+            if ignore_inconsistent_examples:
+                invalid_annotation_paths.append(annotation_path)
+                continue
+            else:
+                raise ValueError(f"Annotation ({annotation_path}) does not have a corresponding image")
 
     return images_paths, annotations_paths, invalid_annotation_paths
 

diff --git a/darwin/utils/utils.py b/darwin/utils/utils.py
@@ -20,9 +20,11 @@
 )
 
 import deprecation
+import json_stream
 import numpy as np
 import orjson as json
 import requests
+from json_stream.base import PersistentStreamingJSONObject
 from jsonschema import exceptions, validators
 from requests import Response, request
 from rich.progress import ProgressType, track
@@ -454,6 +456,45 @@ def parse_darwin_json(path: Path, count: Optional[int] = None) -> Optional[dt.An
         else:
             return _parse_darwin_image(path, data, count)
 
+def stream_darwin_json(path: Path) -> PersistentStreamingJSONObject:
+    """
+    Returns a Darwin JSON file as a persistent stream, so that large files can be read lazily
+    instead of being loaded entirely into memory. The file is opened as UTF-8 text.
+
+    Parameters
+    ----------
+    path : Path
+        Path to the file to parse.
+
+    Returns
+    -------
+    PersistentStreamingJSONObject
+        A lazy stream over the JSON file; its file handle stays open while the stream is alive.
+    """
+    # Deliberately not wrapped in `with`: the stream keeps reading from the handle after this
+    # function returns, so closing it here would make later key lookups hit a closed file.
+    return json_stream.load(path.open(encoding="utf-8"), persistent=True)
+
+def get_image_path_from_stream(darwin_json: PersistentStreamingJSONObject, images_dir: Path) -> Path:
+    """
+    Builds the path of the image file that the given darwin json stream refers to (V1 or V2).
+
+    Parameters
+    ----------
+    darwin_json : PersistentStreamingJSONObject
+        A stream of the JSON file; ``item`` is read first, ``image`` only after a ``KeyError``.
+    images_dir : Path
+        Path to the directory containing the images.
+
+    Returns
+    -------
+    Path
+        ``images_dir`` / recorded path (leading slashes stripped) / file name; existence unchecked.
+    """
+    try:  # "item" is the key the V2 parser reads; any missing key here falls through to "image"
+        return images_dir / (Path(darwin_json['item']['path'].lstrip('/\\'))) / Path(darwin_json['item']['name'])
+    except KeyError:  # presumably the V1 layout, which stores the file name as "filename" - confirm
+        return images_dir / (Path(darwin_json['image']['path'].lstrip('/\\'))) / Path(darwin_json['image']['filename'])
 
 def _parse_darwin_v2(path: Path, data: Dict[str, Any]) -> dt.AnnotationFile:
     item = data["item"]