[AI-1260][internal] add loading of polygon support for object detection datasets #679
Merged: ChristofferEdlund merged 35 commits into master from ai-1260-add-loading-of-polygon-support-for-object-detection-datasets on Oct 20, 2023.
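For orientation, here is a minimal usage sketch of what this PR enables. The import path follows darwin-py's public API; the dataset path below is hypothetical:

from pathlib import Path

from darwin.dataset import LocalDataset

# Hypothetical local copy of a Darwin dataset export.
dataset = LocalDataset(
    dataset_path=Path("/datasets/traffic-signs"),
    annotation_type="bounding_box",
)

# Per the PR title and diff, a dataset requested with bounding_box
# annotations can now also draw classes (and boxes derived from polygon
# annotations) from polygon data.
print(dataset.num_classes)
print(dataset.parse_json(0)["annotations"])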
Commits (35 total; the diff below shows changes from 31 of them), all authored by ChristofferEdlund:
7ff1132 added albumentations transform test
88a2750 updated poetry file
520f9f3 added albumentations to poetry.lock
38fb230 added manual install of albumentations
bc5931c Merge remote-tracking branch 'origin/master' into ai-1260-add-loading…
9e83781 added support to load both polygon and bounding-box annotations for o…
264fe33 commit
31dc64c removed test that will be introduced in another pr
094cc70 added a check for duplicate classes (from polygon and bounding_boxes
65d43eb removed code that is not supposed to be in github workflow
a3dfca9 updated stratified to support bounding_box + polygon
99cb219 removed some printing
752b54f changes based on owen's feedback
09ba55b minor update
4341ca0 Merge remote-tracking branch 'origin/master' into ai-1260-add-loading…
199a71d black formatting
56788cf reverted classes functionality to old one, but added the ability to l…
3855279 linter check
ba78fe1 poetry lock fix
081f249 manually fixed some ruff issues
c5f7286 ignoring ruff import * issues in dataset_test.py
145ce20 refactored local_dataset class to appease ruff (to long init)
99c4186 added test to extract_classes with multiple annotation types selected
67dd274 added stratefied split logic to add polygons to bounding_box stratife…
d128a18 merged from master
7e1f194 BLACK
94da955 Merge remote-tracking branch 'origin/master' into ai-1260-add-loading…
04de9c5 revrting to old init
57797ca revrting to old init
a4431f8 made the refactor more like the original
0ce35b3 added black
f2bee69 fixed minor issue
6aab1ec removed hard val- and test- set requirements
0f799a5 is exhaust generator code present now?
2273fa2 no longer forcing users to have a training split
Diff (local_dataset.py):
@@ -3,7 +3,6 @@
 from typing import Any, Dict, Iterator, List, Optional, Tuple

 import numpy as np
 import orjson as json
 from PIL import Image as PILImage

 from darwin.dataset.utils import get_classes, get_release_path, load_pil_image
@@ -64,20 +63,6 @@ def __init__(
         split_type: str = "random",
         release_name: Optional[str] = None,
     ):
-        assert dataset_path is not None
-        release_path = get_release_path(dataset_path, release_name)
-        annotations_dir = release_path / "annotations"
-        assert annotations_dir.exists()
-        images_dir = dataset_path / "images"
-        assert images_dir.exists()
-
-        if partition not in ["train", "val", "test", None]:
-            raise ValueError("partition should be either 'train', 'val', or 'test'")
-        if split_type not in ["random", "stratified"]:
-            raise ValueError("split_type should be either 'random', 'stratified'")
-        if annotation_type not in ["tag", "polygon", "bounding_box"]:
-            raise ValueError("annotation_type should be either 'tag', 'bounding_box', or 'polygon'")
-
         self.dataset_path = dataset_path
         self.annotation_type = annotation_type
         self.images_path: List[Path] = []
@@ -86,15 +71,64 @@ def __init__(
         self.original_images_path: Optional[List[Path]] = None
         self.original_annotations_path: Optional[List[Path]] = None

+        release_path, annotations_dir, images_dir = self._initial_setup(
+            dataset_path, release_name
+        )
+        self._validate_inputs(partition, split_type, annotation_type)
         # Get the list of classes
+
+        annotation_types = [self.annotation_type]
+        # We fetch bounding_boxes annotations from selected polygons as well
+        if self.annotation_type == "bounding_boxes":
+            annotation_types.append("polygon")
         self.classes = get_classes(
-            self.dataset_path, release_name, annotation_type=self.annotation_type, remove_background=True
+            self.dataset_path,
+            release_name,
+            annotation_type=annotation_types,
+            remove_background=True,
         )
         self.num_classes = len(self.classes)
+        self._setup_annotations_and_images(
+            release_path,
+            annotations_dir,
+            images_dir,
+            annotation_type,
+            split,
+            partition,
+            split_type,
+        )
+
+        if len(self.images_path) == 0:
+            raise ValueError(
+                f"Could not find any {SUPPORTED_IMAGE_EXTENSIONS} file",
+                f" in {images_dir}",
+            )
+
+        assert len(self.images_path) == len(self.annotations_path)

-        stems = build_stems(release_path, annotations_dir, annotation_type, split, partition, split_type)
+    def _validate_inputs(self, partition, split_type, annotation_type):
+        if partition not in ["train", "val", "test", None]:
+            raise ValueError("partition should be either 'train', 'val', or 'test'")
+        if split_type not in ["random", "stratified"]:
+            raise ValueError("split_type should be either 'random', 'stratified'")
+        if annotation_type not in ["tag", "polygon", "bounding_box"]:
+            raise ValueError(
+                "annotation_type should be either 'tag', 'bounding_box', or 'polygon'"
+            )
+
+    # Find all the annotations and their corresponding images
+    def _setup_annotations_and_images(
+        self,
+        release_path,
+        annotations_dir,
+        images_dir,
+        annotation_type,
+        split,
+        partition,
+        split_type,
+    ):
+        stems = build_stems(
+            release_path, annotations_dir, annotation_type, split, partition, split_type
+        )
         for stem in stems:
             annotation_path = annotations_dir / f"{stem}.json"
             images = []
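As a hedged illustration of the class-loading change above: the diff now passes a list of annotation types to get_classes, so requesting bounding boxes can also pick up classes defined by polygons. The dataset path and release name below are hypothetical:

from pathlib import Path

from darwin.dataset.utils import get_classes

# Classes are gathered across both geometries; remove_background drops the
# implicit background class, as in the diff above.
classes = get_classes(
    Path("/datasets/traffic-signs"),  # hypothetical local dataset path
    release_name="latest",            # hypothetical release
    annotation_type=["bounding_box", "polygon"],
    remove_background=True,
)
print(classes)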
@@ -107,16 +141,24 @@ def __init__(
             if image_path.exists():
                 images.append(image_path)
             if len(images) < 1:
-                raise ValueError(f"Annotation ({annotation_path}) does not have a corresponding image")
+                raise ValueError(
+                    f"Annotation ({annotation_path}) does not have a corresponding image"
+                )
             if len(images) > 1:
-                raise ValueError(f"Image ({stem}) is present with multiple extensions. This is forbidden.")
+                raise ValueError(
+                    f"Image ({stem}) is present with multiple extensions. This is forbidden."
+                )
             self.images_path.append(images[0])
             self.annotations_path.append(annotation_path)

-        if len(self.images_path) == 0:
-            raise ValueError(f"Could not find any {SUPPORTED_IMAGE_EXTENSIONS} file", f" in {images_dir}")
-
-        assert len(self.images_path) == len(self.annotations_path)
+    def _initial_setup(self, dataset_path, release_name):
+        assert dataset_path is not None
+        release_path = get_release_path(dataset_path, release_name)
+        annotations_dir = release_path / "annotations"
+        assert annotations_dir.exists()
+        images_dir = dataset_path / "images"
+        assert images_dir.exists()
+        return release_path, annotations_dir, images_dir

Review comment on _initial_setup: "I like the extraction here, even if it is only to make Ruff happy!"

     def get_img_info(self, index: int) -> Dict[str, Any]:
         """
@@ -166,7 +208,9 @@ def get_height_and_width(self, index: int) -> Tuple[float, float]:
         parsed = parse_darwin_json(self.annotations_path[index], index)
         return parsed.image_height, parsed.image_width

-    def extend(self, dataset: "LocalDataset", extend_classes: bool = False) -> "LocalDataset":
+    def extend(
+        self, dataset: "LocalDataset", extend_classes: bool = False
+    ) -> "LocalDataset":
         """
         Extends the current dataset with another one.
@@ -261,7 +305,10 @@ def parse_json(self, index: int) -> Dict[str, Any]:
         # Filter out unused classes and annotations of a different type
         if self.classes is not None:
             annotations = [
-                a for a in annotations if a.annotation_class.name in self.classes and self.annotation_type_supported(a)
+                a
+                for a in annotations
+                if a.annotation_class.name in self.classes
+                and self.annotation_type_supported(a)
             ]
         return {
             "image_id": index,
@@ -278,15 +325,20 @@ def annotation_type_supported(self, annotation) -> bool:
         elif self.annotation_type == "bounding_box":
             is_bounding_box = annotation_type == "bounding_box"
             is_supported_polygon = (
-                annotation_type in ["polygon", "complex_polygon"] and "bounding_box" in annotation.data
+                annotation_type in ["polygon", "complex_polygon"]
+                and "bounding_box" in annotation.data
             )
             return is_bounding_box or is_supported_polygon
         elif self.annotation_type == "polygon":
             return annotation_type in ["polygon", "complex_polygon"]
         else:
-            raise ValueError("annotation_type should be either 'tag', 'bounding_box', or 'polygon'")
+            raise ValueError(
+                "annotation_type should be either 'tag', 'bounding_box', or 'polygon'"
+            )

-    def measure_mean_std(self, multi_threaded: bool = True) -> Tuple[np.ndarray, np.ndarray]:
+    def measure_mean_std(
+        self, multi_threaded: bool = True
+    ) -> Tuple[np.ndarray, np.ndarray]:
         """
         Computes mean and std of trained images, given the train loader.
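To make the filtering rule in annotation_type_supported concrete, here is a self-contained sketch; the annotation class below is a simplified stand-in for darwin-py's real one, not its actual API:

# Simplified stand-in: just a type name and a data dict.
class StubAnnotation:
    def __init__(self, annotation_type, data):
        self.annotation_type = annotation_type
        self.data = data

def supported_as_bounding_box(annotation):
    # Native bounding boxes always pass.
    if annotation.annotation_type == "bounding_box":
        return True
    # Polygons pass only when the export carries a precomputed bounding box.
    return (
        annotation.annotation_type in ["polygon", "complex_polygon"]
        and "bounding_box" in annotation.data
    )

print(supported_as_bounding_box(StubAnnotation("bounding_box", {})))               # True
print(supported_as_bounding_box(StubAnnotation("polygon", {"bounding_box": {}})))  # True
print(supported_as_bounding_box(StubAnnotation("polygon", {"path": []})))          # False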
@@ -309,7 +361,9 @@ def measure_mean_std(self, multi_threaded: bool = True) -> Tuple[np.ndarray, np.ndarray]:
             results = pool.map(self._return_mean, self.images_path)
             mean = np.sum(np.array(results), axis=0) / len(self.images_path)
             # Online image_classification deviation
-            results = pool.starmap(self._return_std, [[item, mean] for item in self.images_path])
+            results = pool.starmap(
+                self._return_std, [[item, mean] for item in self.images_path]
+            )
             std_sum = np.sum(np.array([item[0] for item in results]), axis=0)
             total_pixel_count = np.sum(np.array([item[1] for item in results]))
             std = np.sqrt(std_sum / total_pixel_count)
@@ -355,14 +409,20 @@ def _compute_weights(labels: List[int]) -> np.ndarray:
     @staticmethod
     def _return_mean(image_path: Path) -> np.ndarray:
         img = np.array(load_pil_image(image_path))
-        mean = np.array([np.mean(img[:, :, 0]), np.mean(img[:, :, 1]), np.mean(img[:, :, 2])])
+        mean = np.array(
+            [np.mean(img[:, :, 0]), np.mean(img[:, :, 1]), np.mean(img[:, :, 2])]
+        )
         return mean / 255.0

     # Loads an image with OpenCV and returns the channel wise std of the image.
     @staticmethod
     def _return_std(image_path: Path, mean: np.ndarray) -> Tuple[np.ndarray, float]:
         img = np.array(load_pil_image(image_path)) / 255.0
-        m2 = np.square(np.array([img[:, :, 0] - mean[0], img[:, :, 1] - mean[1], img[:, :, 2] - mean[2]]))
+        m2 = np.square(
+            np.array(
+                [img[:, :, 0] - mean[0], img[:, :, 1] - mean[1], img[:, :, 2] - mean[2]]
+            )
+        )
         return np.sum(np.sum(m2, axis=1), 1), m2.size / 3.0

     def __getitem__(self, index: int):
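For reference, measure_mean_std and the two helpers above follow a two-pass scheme: the mean is the unweighted average of per-image channel means, and the std divides the accumulated squared deviations by the total per-channel pixel count. A condensed numpy sketch of the same idea, with toy arrays standing in for loaded images:

import numpy as np

# Toy stand-ins for loaded RGB images (HxWx3, values in [0, 1]).
images = [np.random.rand(4, 4, 3), np.random.rand(2, 2, 3)]

# Pass 1: unweighted average of per-image channel means.
mean = np.mean([img.mean(axis=(0, 1)) for img in images], axis=0)

# Pass 2: accumulate squared deviations and the pixel count per channel.
sq_dev_sum = np.zeros(3)
pixel_count = 0
for img in images:
    sq_dev_sum += ((img - mean) ** 2).sum(axis=(0, 1))
    pixel_count += img.shape[0] * img.shape[1]

std = np.sqrt(sq_dev_sum / pixel_count)
print(mean, std)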
@@ -432,7 +492,10 @@ def build_stems(
     """

     if partition is None:
-        return (str(e.relative_to(annotations_dir).parent / e.stem) for e in sorted(annotations_dir.glob("**/*.json")))
+        return (
+            str(e.relative_to(annotations_dir).parent / e.stem)
+            for e in sorted(annotations_dir.glob("**/*.json"))
+        )

     if split_type == "random":
         split_filename = f"{split_type}_{partition}.txt"
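Finally, the partition=None branch of build_stems shown above simply enumerates every annotation stem relative to the annotations directory. A hedged re-creation against a hypothetical layout:

from pathlib import Path

# Hypothetical release layout: <release>/annotations/**/*.json.
annotations_dir = Path("/datasets/traffic-signs/releases/latest/annotations")

# e.g. annotations/a/1.json -> "a/1", annotations/2.json -> "2"; __init__
# later pairs each stem with an image file of the same name.
stems = (
    str(e.relative_to(annotations_dir).parent / e.stem)
    for e in sorted(annotations_dir.glob("**/*.json"))
)
print(list(stems))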
Review comments:
"Refactoring the init function to make ruff happy"
"I like this refactoring - thanks ruff!"