diff --git a/darwin/dataset/local_dataset.py b/darwin/dataset/local_dataset.py index 9b7cc31f1..61a952d74 100644 --- a/darwin/dataset/local_dataset.py +++ b/darwin/dataset/local_dataset.py @@ -71,7 +71,9 @@ def __init__( self.original_images_path: Optional[List[Path]] = None self.original_annotations_path: Optional[List[Path]] = None - release_path, annotations_dir, images_dir = self._initial_setup(dataset_path, release_name) + release_path, annotations_dir, images_dir = self._initial_setup( + dataset_path, release_name + ) self._validate_inputs(partition, split_type, annotation_type) # Get the list of classes @@ -79,12 +81,28 @@ def __init__( # We fetch bounding_boxes annotations from selected polygons as well if self.annotation_type == "bounding_boxes": annotation_types.append("polygon") - self.classes = get_classes(self.dataset_path, release_name, annotation_type=annotation_types, remove_background=True) + self.classes = get_classes( + self.dataset_path, + release_name, + annotation_type=annotation_types, + remove_background=True, + ) self.num_classes = len(self.classes) - self._setup_annotations_and_images(release_path, annotations_dir, images_dir, annotation_type, split, partition, split_type) + self._setup_annotations_and_images( + release_path, + annotations_dir, + images_dir, + annotation_type, + split, + partition, + split_type, + ) if len(self.images_path) == 0: - raise ValueError(f"Could not find any {SUPPORTED_IMAGE_EXTENSIONS} file", f" in {images_dir}") + raise ValueError( + f"Could not find any {SUPPORTED_IMAGE_EXTENSIONS} file", + f" in {images_dir}", + ) assert len(self.images_path) == len(self.annotations_path) @@ -94,10 +112,23 @@ def _validate_inputs(self, partition, split_type, annotation_type): if split_type not in ["random", "stratified"]: raise ValueError("split_type should be either 'random', 'stratified'") if annotation_type not in ["tag", "polygon", "bounding_box"]: - raise ValueError("annotation_type should be either 'tag', 'bounding_box', or 'polygon'") + raise ValueError( + "annotation_type should be either 'tag', 'bounding_box', or 'polygon'" + ) - def _setup_annotations_and_images(self, release_path, annotations_dir, images_dir, annotation_type, split, partition, split_type): - stems = build_stems(release_path, annotations_dir, annotation_type, split, partition, split_type) + def _setup_annotations_and_images( + self, + release_path, + annotations_dir, + images_dir, + annotation_type, + split, + partition, + split_type, + ): + stems = build_stems( + release_path, annotations_dir, annotation_type, split, partition, split_type + ) for stem in stems: annotation_path = annotations_dir / f"{stem}.json" images = [] @@ -110,9 +141,13 @@ def _setup_annotations_and_images(self, release_path, annotations_dir, images_di if image_path.exists(): images.append(image_path) if len(images) < 1: - raise ValueError(f"Annotation ({annotation_path}) does not have a corresponding image") + raise ValueError( + f"Annotation ({annotation_path}) does not have a corresponding image" + ) if len(images) > 1: - raise ValueError(f"Image ({stem}) is present with multiple extensions. This is forbidden.") + raise ValueError( + f"Image ({stem}) is present with multiple extensions. This is forbidden." + ) self.images_path.append(images[0]) self.annotations_path.append(annotation_path) @@ -173,7 +208,9 @@ def get_height_and_width(self, index: int) -> Tuple[float, float]: parsed = parse_darwin_json(self.annotations_path[index], index) return parsed.image_height, parsed.image_width - def extend(self, dataset: "LocalDataset", extend_classes: bool = False) -> "LocalDataset": + def extend( + self, dataset: "LocalDataset", extend_classes: bool = False + ) -> "LocalDataset": """ Extends the current dataset with another one. @@ -267,7 +304,12 @@ def parse_json(self, index: int) -> Dict[str, Any]: # Filter out unused classes and annotations of a different type if self.classes is not None: - annotations = [a for a in annotations if a.annotation_class.name in self.classes and self.annotation_type_supported(a)] + annotations = [ + a + for a in annotations + if a.annotation_class.name in self.classes + and self.annotation_type_supported(a) + ] return { "image_id": index, "image_path": str(self.images_path[index]), @@ -282,14 +324,21 @@ def annotation_type_supported(self, annotation) -> bool: return annotation_type == "tag" elif self.annotation_type == "bounding_box": is_bounding_box = annotation_type == "bounding_box" - is_supported_polygon = annotation_type in ["polygon", "complex_polygon"] and "bounding_box" in annotation.data + is_supported_polygon = ( + annotation_type in ["polygon", "complex_polygon"] + and "bounding_box" in annotation.data + ) return is_bounding_box or is_supported_polygon elif self.annotation_type == "polygon": return annotation_type in ["polygon", "complex_polygon"] else: - raise ValueError("annotation_type should be either 'tag', 'bounding_box', or 'polygon'") + raise ValueError( + "annotation_type should be either 'tag', 'bounding_box', or 'polygon'" + ) - def measure_mean_std(self, multi_threaded: bool = True) -> Tuple[np.ndarray, np.ndarray]: + def measure_mean_std( + self, multi_threaded: bool = True + ) -> Tuple[np.ndarray, np.ndarray]: """ Computes mean and std of trained images, given the train loader. @@ -312,7 +361,9 @@ def measure_mean_std(self, multi_threaded: bool = True) -> Tuple[np.ndarray, np. results = pool.map(self._return_mean, self.images_path) mean = np.sum(np.array(results), axis=0) / len(self.images_path) # Online image_classification deviation - results = pool.starmap(self._return_std, [[item, mean] for item in self.images_path]) + results = pool.starmap( + self._return_std, [[item, mean] for item in self.images_path] + ) std_sum = np.sum(np.array([item[0] for item in results]), axis=0) total_pixel_count = np.sum(np.array([item[1] for item in results])) std = np.sqrt(std_sum / total_pixel_count) @@ -358,14 +409,20 @@ def _compute_weights(labels: List[int]) -> np.ndarray: @staticmethod def _return_mean(image_path: Path) -> np.ndarray: img = np.array(load_pil_image(image_path)) - mean = np.array([np.mean(img[:, :, 0]), np.mean(img[:, :, 1]), np.mean(img[:, :, 2])]) + mean = np.array( + [np.mean(img[:, :, 0]), np.mean(img[:, :, 1]), np.mean(img[:, :, 2])] + ) return mean / 255.0 # Loads an image with OpenCV and returns the channel wise std of the image. @staticmethod def _return_std(image_path: Path, mean: np.ndarray) -> Tuple[np.ndarray, float]: img = np.array(load_pil_image(image_path)) / 255.0 - m2 = np.square(np.array([img[:, :, 0] - mean[0], img[:, :, 1] - mean[1], img[:, :, 2] - mean[2]])) + m2 = np.square( + np.array( + [img[:, :, 0] - mean[0], img[:, :, 1] - mean[1], img[:, :, 2] - mean[2]] + ) + ) return np.sum(np.sum(m2, axis=1), 1), m2.size / 3.0 def __getitem__(self, index: int): @@ -435,7 +492,10 @@ def build_stems( """ if partition is None: - return (str(e.relative_to(annotations_dir).parent / e.stem) for e in sorted(annotations_dir.glob("**/*.json"))) + return ( + str(e.relative_to(annotations_dir).parent / e.stem) + for e in sorted(annotations_dir.glob("**/*.json")) + ) if split_type == "random": split_filename = f"{split_type}_{partition}.txt" @@ -448,4 +508,7 @@ def build_stems( if split_path.is_file(): return (e.strip("\n\r") for e in split_path.open()) - raise FileNotFoundError("could not find a dataset partition. " "Split the dataset using `split_dataset()` from `darwin.dataset.split_manager`") + raise FileNotFoundError( + "could not find a dataset partition. " + "Split the dataset using `split_dataset()` from `darwin.dataset.split_manager`" + )