diff --git a/darwin/dataset/local_dataset.py b/darwin/dataset/local_dataset.py index d3749aa55..ffbe6488c 100644 --- a/darwin/dataset/local_dataset.py +++ b/darwin/dataset/local_dataset.py @@ -71,7 +71,9 @@ def __init__( self.original_images_path: Optional[List[Path]] = None self.original_annotations_path: Optional[List[Path]] = None - release_path, annotations_dir, images_dir = self._initial_setup(dataset_path, release_name) + release_path, annotations_dir, images_dir = self._initial_setup( + dataset_path, release_name + ) self._validate_inputs(partition, split_type, annotation_type) # Get the list of classes @@ -79,12 +81,28 @@ def __init__( # We fetch bounding_boxes annotations from selected polygons as well if self.annotation_type == "bounding_boxes": annotation_types.append("polygon") - self.classes = get_classes(self.dataset_path, release_name, annotation_type=annotation_types, remove_background=True) + self.classes = get_classes( + self.dataset_path, + release_name, + annotation_type=annotation_types, + remove_background=True, + ) self.num_classes = len(self.classes) - self._setup_annotations_and_images(release_path, annotations_dir, images_dir, annotation_type, split, partition, split_type) + self._setup_annotations_and_images( + release_path, + annotations_dir, + images_dir, + annotation_type, + split, + partition, + split_type, + ) if len(self.images_path) == 0: - raise ValueError(f"Could not find any {SUPPORTED_IMAGE_EXTENSIONS} file", f" in {images_dir}") + raise ValueError( + f"Could not find any {SUPPORTED_IMAGE_EXTENSIONS} file", + f" in {images_dir}", + ) assert len(self.images_path) == len(self.annotations_path) @@ -94,22 +112,42 @@ def _validate_inputs(self, partition, split_type, annotation_type): if split_type not in ["random", "stratified"]: raise ValueError("split_type should be either 'random', 'stratified'") if annotation_type not in ["tag", "polygon", "bounding_box"]: - raise ValueError("annotation_type should be either 'tag', 'bounding_box', or 'polygon'") + raise ValueError( + "annotation_type should be either 'tag', 'bounding_box', or 'polygon'" + ) - def _setup_annotations_and_images(self, release_path, annotations_dir, images_dir, annotation_type, split, partition, split_type): - stems = build_stems(release_path, annotations_dir, annotation_type, split, partition, split_type) + def _setup_annotations_and_images( + self, + release_path, + annotations_dir, + images_dir, + annotation_type, + split, + partition, + split_type, + ): + stems = build_stems( + release_path, annotations_dir, annotation_type, split, partition, split_type + ) for stem in stems: annotation_path = annotations_dir / f"{stem}.json" images = [ image_path for ext in SUPPORTED_IMAGE_EXTENSIONS - for image_path in [images_dir / f"{stem}{ext}", images_dir / f"{stem}{ext.upper()}"] + for image_path in [ + images_dir / f"{stem}{ext}", + images_dir / f"{stem}{ext.upper()}", + ] if image_path.exists() ] if len(images) < 1: - raise ValueError(f"Annotation ({annotation_path}) does not have a corresponding image") + raise ValueError( + f"Annotation ({annotation_path}) does not have a corresponding image" + ) if len(images) > 1: - raise ValueError(f"Image ({stem}) is present with multiple extensions. This is forbidden.") + raise ValueError( + f"Image ({stem}) is present with multiple extensions. This is forbidden." 
+ ) self.images_path.append(images[0]) self.annotations_path.append(annotation_path) @@ -170,7 +208,9 @@ def get_height_and_width(self, index: int) -> Tuple[float, float]: parsed = parse_darwin_json(self.annotations_path[index], index) return parsed.image_height, parsed.image_width - def extend(self, dataset: "LocalDataset", extend_classes: bool = False) -> "LocalDataset": + def extend( + self, dataset: "LocalDataset", extend_classes: bool = False + ) -> "LocalDataset": """ Extends the current dataset with another one. @@ -264,7 +304,12 @@ def parse_json(self, index: int) -> Dict[str, Any]: # Filter out unused classes and annotations of a different type if self.classes is not None: - annotations = [a for a in annotations if a.annotation_class.name in self.classes and self.annotation_type_supported(a)] + annotations = [ + a + for a in annotations + if a.annotation_class.name in self.classes + and self.annotation_type_supported(a) + ] return { "image_id": index, "image_path": str(self.images_path[index]), @@ -279,14 +324,21 @@ def annotation_type_supported(self, annotation) -> bool: return annotation_type == "tag" elif self.annotation_type == "bounding_box": is_bounding_box = annotation_type == "bounding_box" - is_supported_polygon = annotation_type in ["polygon", "complex_polygon"] and "bounding_box" in annotation.data + is_supported_polygon = ( + annotation_type in ["polygon", "complex_polygon"] + and "bounding_box" in annotation.data + ) return is_bounding_box or is_supported_polygon elif self.annotation_type == "polygon": return annotation_type in ["polygon", "complex_polygon"] else: - raise ValueError("annotation_type should be either 'tag', 'bounding_box', or 'polygon'") + raise ValueError( + "annotation_type should be either 'tag', 'bounding_box', or 'polygon'" + ) - def measure_mean_std(self, multi_threaded: bool = True) -> Tuple[np.ndarray, np.ndarray]: + def measure_mean_std( + self, multi_threaded: bool = True + ) -> Tuple[np.ndarray, np.ndarray]: """ Computes mean and std of trained images, given the train loader. @@ -309,7 +361,9 @@ def measure_mean_std(self, multi_threaded: bool = True) -> Tuple[np.ndarray, np. results = pool.map(self._return_mean, self.images_path) mean = np.sum(np.array(results), axis=0) / len(self.images_path) # Online image_classification deviation - results = pool.starmap(self._return_std, [[item, mean] for item in self.images_path]) + results = pool.starmap( + self._return_std, [[item, mean] for item in self.images_path] + ) std_sum = np.sum(np.array([item[0] for item in results]), axis=0) total_pixel_count = np.sum(np.array([item[1] for item in results])) std = np.sqrt(std_sum / total_pixel_count) @@ -355,14 +409,20 @@ def _compute_weights(labels: List[int]) -> np.ndarray: @staticmethod def _return_mean(image_path: Path) -> np.ndarray: img = np.array(load_pil_image(image_path)) - mean = np.array([np.mean(img[:, :, 0]), np.mean(img[:, :, 1]), np.mean(img[:, :, 2])]) + mean = np.array( + [np.mean(img[:, :, 0]), np.mean(img[:, :, 1]), np.mean(img[:, :, 2])] + ) return mean / 255.0 # Loads an image with OpenCV and returns the channel wise std of the image. 
@staticmethod def _return_std(image_path: Path, mean: np.ndarray) -> Tuple[np.ndarray, float]: img = np.array(load_pil_image(image_path)) / 255.0 - m2 = np.square(np.array([img[:, :, 0] - mean[0], img[:, :, 1] - mean[1], img[:, :, 2] - mean[2]])) + m2 = np.square( + np.array( + [img[:, :, 0] - mean[0], img[:, :, 1] - mean[1], img[:, :, 2] - mean[2]] + ) + ) return np.sum(np.sum(m2, axis=1), 1), m2.size / 3.0 def __getitem__(self, index: int): @@ -432,7 +492,10 @@ def build_stems( """ if partition is None: - return (str(e.relative_to(annotations_dir).parent / e.stem) for e in sorted(annotations_dir.glob("**/*.json"))) + return ( + str(e.relative_to(annotations_dir).parent / e.stem) + for e in sorted(annotations_dir.glob("**/*.json")) + ) if split_type == "random": split_filename = f"{split_type}_{partition}.txt" @@ -445,4 +508,7 @@ def build_stems( if split_path.is_file(): return (e.strip("\n\r") for e in split_path.open()) - raise FileNotFoundError("could not find a dataset partition. " "Split the dataset using `split_dataset()` from `darwin.dataset.split_manager`") + raise FileNotFoundError( + "could not find a dataset partition. " + "Split the dataset using `split_dataset()` from `darwin.dataset.split_manager`" + ) diff --git a/darwin/dataset/split_manager.py b/darwin/dataset/split_manager.py index c9bac9a02..5561631f4 100644 --- a/darwin/dataset/split_manager.py +++ b/darwin/dataset/split_manager.py @@ -119,7 +119,9 @@ def split_dataset( try: import sklearn # noqa except ImportError: - raise ImportError("Darwin requires scikit-learn to split a dataset. Install it using: pip install scikit-learn") from None + raise ImportError( + "Darwin requires scikit-learn to split a dataset. Install it using: pip install scikit-learn" + ) from None _validate_split(val_percentage, test_percentage) @@ -259,13 +261,32 @@ def _stratified_split( else: test_indices.append(idx) - _write_to_file(annotation_path, annotation_files, split[stratified_type]["train"], train_indices) - _write_to_file(annotation_path, annotation_files, split[stratified_type]["val"], val_indices) - _write_to_file(annotation_path, annotation_files, split[stratified_type]["test"], test_indices) + _write_to_file( + annotation_path, + annotation_files, + split[stratified_type]["train"], + train_indices, + ) + _write_to_file( + annotation_path, + annotation_files, + split[stratified_type]["val"], + val_indices, + ) + _write_to_file( + annotation_path, + annotation_files, + split[stratified_type]["test"], + test_indices, + ) def _stratify_samples( - idx_to_classes: Dict[int, Set[str]], split_seed: int, train_size: int, val_size: int, test_size: int + idx_to_classes: Dict[int, Set[str]], + split_seed: int, + train_size: int, + val_size: int, + test_size: int, ) -> Tuple[List[int], List[int], List[int]]: """Splits the list of indices into train, val and test according to their labels (stratified) @@ -337,7 +358,11 @@ def _stratify_samples( # Remove duplicates within the same set # NOTE: doing that earlier (e.g. in _remove_cross_contamination()) would produce mathematical # mistakes in the class balancing between validation and test sets. 
- return (list(set(X_train.astype(int))), list(set(X_val.astype(int))), list(set(X_test.astype(int)))) + return ( + list(set(X_train.astype(int))), + list(set(X_val.astype(int))), + list(set(X_test.astype(int))), + ) def _remove_cross_contamination( @@ -397,35 +422,58 @@ def _unique(array: np.ndarray) -> np.ndarray: return array[sorted(indexes)] -def _write_to_file(annotation_path: Path, annotation_files: List[Path], file_path: Path, split_idx: Iterable) -> None: +def _write_to_file( + annotation_path: Path, + annotation_files: List[Path], + file_path: Path, + split_idx: Iterable, +) -> None: with open(str(file_path), "w") as f: for i in split_idx: # To deal with recursive search, we want to write the difference between the annotation path # and its parent, without the file extension - stem = str(annotation_files[i]).replace(f"{annotation_path}/", "").rsplit(".json", 1)[0] + stem = ( + str(annotation_files[i]) + .replace(f"{annotation_path}/", "") + .rsplit(".json", 1)[0] + ) f.write(f"{stem}\n") def _validate_split(val_percentage: float, test_percentage: float) -> None: if val_percentage is None or not 0 < val_percentage < 1: - raise ValueError(f"Invalid validation percentage ({val_percentage}). Must be a float x, where 0 < x < 1.") + raise ValueError( + f"Invalid validation percentage ({val_percentage}). Must be a float x, where 0 < x < 1." + ) if test_percentage is None or not 0 < test_percentage < 1: - raise ValueError(f"Invalid test percentage ({test_percentage}). Must be a float x, where 0 < x < 1.") + raise ValueError( + f"Invalid test percentage ({test_percentage}). Must be a float x, where 0 < x < 1." + ) if val_percentage + test_percentage >= 1: raise ValueError( - f"Invalid combination of validation ({val_percentage}) and test ({test_percentage}) percentages. " f"Their sum must be a value x, where x < 1." + f"Invalid combination of validation ({val_percentage}) and test ({test_percentage}) percentages. " + f"Their sum must be a value x, where x < 1." ) -def _build_split(split_path: Path, stratified_types: List[str], partitions: List[str] = ["train", "val", "test"]) -> Split: +def _build_split( + split_path: Path, + stratified_types: List[str], + partitions: List[str] = ["train", "val", "test"], +) -> Split: split = Split() - split.random = {partition: split_path / f"random_{partition}.txt" for partition in partitions} + split.random = { + partition: split_path / f"random_{partition}.txt" for partition in partitions + } if len(stratified_types) == 0: return split stratified_dict: Dict[str, Dict[str, Path]] = {} for stratified_type in stratified_types: - stratified_dict[stratified_type] = {partition: split_path / f"stratified_{stratified_type}_{partition}.txt" for partition in partitions} + stratified_dict[stratified_type] = { + partition: split_path / f"stratified_{stratified_type}_{partition}.txt" + for partition in partitions + } split.stratified = stratified_dict return split diff --git a/darwin/dataset/utils.py b/darwin/dataset/utils.py index e0228bb9d..fc0dfa20c 100644 --- a/darwin/dataset/utils.py +++ b/darwin/dataset/utils.py @@ -62,53 +62,55 @@ def get_release_path(dataset_path: Path, release_name: Optional[str] = None) -> def extract_classes( - annotations_path: Path, - annotation_type: Union[str, List[str]] - ) -> Tuple[Dict[str, Set[int]], Dict[int, Set[str]]]: - """ - Given the GT as json files extracts all classes and maps images index to classes. - - Parameters - ---------- - annotations_files : Path - Path to the json files with the GT information of each image. 
- annotation_type : Union[str, List[str]] - Type(s) of annotation to use to extract the GT information. - - Returns - ------- - Tuple[Dict[str, Set[int]], Dict[int, Set[str]]] - A Tuple where the first element is a ``Dictionary`` where keys are the classes found in the - GT and values are a list of file numbers which contain it; and the second element is - ``Dictionary`` where keys are image indices and values are all classes - contained in that image. - """ - - if isinstance(annotation_type, str): - annotation_types_to_load = [annotation_type] - else: - annotation_types_to_load = annotation_type + annotations_path: Path, annotation_type: Union[str, List[str]] +) -> Tuple[Dict[str, Set[int]], Dict[int, Set[str]]]: + """ + Given the GT as json files extracts all classes and maps images index to classes. - for atype in annotation_types_to_load: - assert atype in ["bounding_box", "polygon", "tag"] + Parameters + ---------- + annotations_files : Path + Path to the json files with the GT information of each image. + annotation_type : Union[str, List[str]] + Type(s) of annotation to use to extract the GT information. - classes: Dict[str, Set[int]] = defaultdict(set) - indices_to_classes: Dict[int, Set[str]] = defaultdict(set) + Returns + ------- + Tuple[Dict[str, Set[int]], Dict[int, Set[str]]] + A Tuple where the first element is a ``Dictionary`` where keys are the classes found in the + GT and values are a list of file numbers which contain it; and the second element is + ``Dictionary`` where keys are image indices and values are all classes + contained in that image. + """ - for i, file_name in enumerate(sorted(annotations_path.glob("**/*.json"))): - annotation_file = parse_path(file_name) - if not annotation_file: - continue + if isinstance(annotation_type, str): + annotation_types_to_load = [annotation_type] + else: + annotation_types_to_load = annotation_type - for annotation in annotation_file.annotations: - if annotation.annotation_class.annotation_type not in annotation_types_to_load: - continue + for atype in annotation_types_to_load: + assert atype in ["bounding_box", "polygon", "tag"] + + classes: Dict[str, Set[int]] = defaultdict(set) + indices_to_classes: Dict[int, Set[str]] = defaultdict(set) + + for i, file_name in enumerate(sorted(annotations_path.glob("**/*.json"))): + annotation_file = parse_path(file_name) + if not annotation_file: + continue + + for annotation in annotation_file.annotations: + if ( + annotation.annotation_class.annotation_type + not in annotation_types_to_load + ): + continue - class_name = annotation.annotation_class.name - indices_to_classes[i].add(class_name) - classes[class_name].add(i) + class_name = annotation.annotation_class.name + indices_to_classes[i].add(class_name) + classes[class_name].add(i) - return classes, indices_to_classes + return classes, indices_to_classes def make_class_lists(release_path: Path) -> None: @@ -190,7 +192,11 @@ def get_classes( classes_file_path = release_path / f"lists/classes_{atype}.txt" class_per_annotations = get_classes_from_file(classes_file_path) - if remove_background and class_per_annotations and class_per_annotations[0] == "__background__": + if ( + remove_background + and class_per_annotations + and class_per_annotations[0] == "__background__" + ): class_per_annotations = class_per_annotations[1:] for cls in class_per_annotations: @@ -198,7 +204,9 @@ def get_classes( classes.append(cls) available_types = available_annotation_types(release_path) - assert len(classes) > 0, f"No classes found for 
{annotation_type}. Supported types are: {', '.join(available_types)}" + assert ( + len(classes) > 0 + ), f"No classes found for {annotation_type}. Supported types are: {', '.join(available_types)}" return classes @@ -211,7 +219,10 @@ def _f(x: Any) -> Any: def exhaust_generator( - progress: Generator, count: int, multi_threaded: bool, worker_count: Optional[int] = None + progress: Generator, + count: int, + multi_threaded: bool, + worker_count: Optional[int] = None, ) -> Tuple[List[Dict[str, Any]], List[Exception]]: """ Exhausts the generator passed as parameter. Can be done multi threaded if desired. @@ -297,7 +308,9 @@ def get_coco_format_record( objs = [] for obj in data.annotations: if annotation_type != obj.annotation_class.annotation_type: - if annotation_type not in obj.data: # Allows training object detection with bboxes + if ( + annotation_type not in obj.data + ): # Allows training object detection with bboxes continue if annotation_type == "polygon": @@ -340,7 +353,9 @@ def create_polygon_object(obj, box_mode, classes=None): "segmentation": segmentation, "bbox": [np.min(all_px), np.min(all_py), np.max(all_px), np.max(all_py)], "bbox_mode": box_mode, - "category_id": classes.index(obj.annotation_class.name) if classes else obj.annotation_class.name, + "category_id": classes.index(obj.annotation_class.name) + if classes + else obj.annotation_class.name, "iscrowd": 0, } @@ -352,7 +367,9 @@ def create_bbox_object(obj, box_mode, classes=None): new_obj = { "bbox": [bbox["x"], bbox["y"], bbox["x"] + bbox["w"], bbox["y"] + bbox["h"]], "bbox_mode": box_mode, - "category_id": classes.index(obj.annotation_class.name) if classes else obj.annotation_class.name, + "category_id": classes.index(obj.annotation_class.name) + if classes + else obj.annotation_class.name, "iscrowd": 0, } @@ -421,36 +438,54 @@ def get_annotations( _validate_inputs(partition, split_type, annotation_type) - classes = get_classes(dataset_path, release_name, annotation_type=annotation_type, remove_background=True) + classes = get_classes( + dataset_path, + release_name, + annotation_type=annotation_type, + remove_background=True, + ) if partition: - stems = _get_stems_from_split(release_path, split, split_type, annotation_type, partition) + stems = _get_stems_from_split( + release_path, split, split_type, annotation_type, partition + ) else: stems = (e.stem for e in annotations_dir.glob("**/*.json")) - images_paths, annotations_paths, invalid_annotation_paths = _map_annotations_to_images(stems, annotations_dir, images_dir, ignore_inconsistent_examples) + ( + images_paths, + annotations_paths, + invalid_annotation_paths, + ) = _map_annotations_to_images( + stems, annotations_dir, images_dir, ignore_inconsistent_examples + ) print(f"Found {len(invalid_annotation_paths)} invalid annotations") for p in invalid_annotation_paths: print(p) if len(images_paths) == 0: - raise ValueError(f"Could not find any {SUPPORTED_EXTENSIONS} file" f" in {dataset_path / 'images'}") + raise ValueError( + f"Could not find any {SUPPORTED_EXTENSIONS} file" + f" in {dataset_path / 'images'}" + ) assert len(images_paths) == len(annotations_paths) - yield from _load_and_format_annotations(images_paths, annotations_paths, annotation_format, annotation_type, classes) + yield from _load_and_format_annotations( + images_paths, annotations_paths, annotation_format, annotation_type, classes + ) def _validate_inputs(partition, split_type, annotation_type): """ Validates the input parameters for partition, split_type, and annotation_type. 
- + Args: partition (str, None): Dataset partition. Should be 'train', 'val', 'test' or None. split_type (str, None): Type of dataset split. Can be 'random', 'stratified' or None. annotation_type (str): Type of annotations. Can be 'tag', 'polygon', or 'bounding_box'. - + Raises: ValueError: If the input parameters do not match the expected values. """ @@ -459,23 +494,25 @@ def _validate_inputs(partition, split_type, annotation_type): if split_type not in ["random", "stratified", None]: raise ValueError("split_type should be either 'random', 'stratified', or None") if annotation_type not in ["tag", "polygon", "bounding_box"]: - raise ValueError("annotation_type should be either 'tag', 'bounding_box', or 'polygon'") + raise ValueError( + "annotation_type should be either 'tag', 'bounding_box', or 'polygon'" + ) def _get_stems_from_split(release_path, split, split_type, annotation_type, partition): """ Determines the file stems based on the dataset split and other parameters. - + Args: release_path (Path): Path to the dataset release. split (str): Dataset split identifier. split_type (str, None): Type of dataset split. Can be 'random', 'stratified' or None. annotation_type (str): Type of annotations. Can be 'tag', 'polygon', or 'bounding_box'. partition (str, None): Dataset partition. Should be 'train', 'val', 'test' or None. - + Returns: Generator[str]: File stems for the dataset. - + Raises: ValueError: If the split_type is invalid. FileNotFoundError: If the dataset partition file is not found. @@ -500,19 +537,21 @@ def _get_stems_from_split(release_path, split, split_type, annotation_type, part ) -def _map_annotations_to_images(stems, annotations_dir, images_dir, ignore_inconsistent_examples): +def _map_annotations_to_images( + stems, annotations_dir, images_dir, ignore_inconsistent_examples +): """ Maps annotations to their corresponding images based on the file stems. - + Args: stems (List[str]): List of file stems. annotations_dir (Path): Directory containing annotation files. images_dir (Path): Directory containing image files. ignore_inconsistent_examples (bool): Flag to determine if inconsistent examples should be ignored. - + Returns: Tuple[List[Path], List[Path], List[Path]]: Lists of paths for images, annotations, and invalid annotations respectively. - + Raises: ValueError: If there are inconsistencies with the annotations and images. """ @@ -536,9 +575,13 @@ def _map_annotations_to_images(stems, annotations_dir, images_dir, ignore_incons invalid_annotation_paths.append(annotation_path) continue elif image_count < 1: - raise ValueError(f"Annotation ({annotation_path}) does not have a corresponding image") + raise ValueError( + f"Annotation ({annotation_path}) does not have a corresponding image" + ) elif image_count > 1: - raise ValueError(f"Image ({stem}) is present with multiple extensions. This is forbidden.") + raise ValueError( + f"Image ({stem}) is present with multiple extensions. This is forbidden." + ) images_paths.append(images[0]) annotations_paths.append(annotation_path) @@ -546,28 +589,34 @@ def _map_annotations_to_images(stems, annotations_dir, images_dir, ignore_incons return images_paths, annotations_paths, invalid_annotation_paths -def _load_and_format_annotations(images_paths, annotations_paths, annotation_format, annotation_type, classes): +def _load_and_format_annotations( + images_paths, annotations_paths, annotation_format, annotation_type, classes +): """ Loads and formats annotations based on the specified format and type. 
- + Args: images_paths (List[Path]): List of paths to image files. annotations_paths (List[Path]): List of paths to annotation files. annotation_format (str): Desired output format for annotations. Can be 'coco' or 'darwin'. annotation_type (str): Type of annotations. Can be 'tag', 'polygon', or 'bounding_box'. classes (List[str]): List of class names. - + Yields: Dict: Formatted annotation record. - + Notes: - If the annotation format is 'coco', video annotations cannot be loaded and will be skipped. """ if annotation_format == "coco": images_ids = list(range(len(images_paths))) - for annotation_path, image_path, image_id in zip(annotations_paths, images_paths, images_ids): + for annotation_path, image_path, image_id in zip( + annotations_paths, images_paths, images_ids + ): if image_path.suffix.lower() in SUPPORTED_VIDEO_EXTENSIONS: - print(f"[WARNING] Cannot load video annotation into COCO format. Skipping {image_path}") + print( + f"[WARNING] Cannot load video annotation into COCO format. Skipping {image_path}" + ) continue yield get_coco_format_record( annotation_path=annotation_path, @@ -705,24 +754,35 @@ def compute_distributions( - instance_distribution: count of all instances of a given class exist for each partition """ - class_distribution: AnnotationDistribution = {partition: Counter() for partition in partitions} - instance_distribution: AnnotationDistribution = {partition: Counter() for partition in partitions} + class_distribution: AnnotationDistribution = { + partition: Counter() for partition in partitions + } + instance_distribution: AnnotationDistribution = { + partition: Counter() for partition in partitions + } for partition in partitions: for annotation_type in annotation_types: - split_file: Path = split_path / f"stratified_{annotation_type}_{partition}.txt" + split_file: Path = ( + split_path / f"stratified_{annotation_type}_{partition}.txt" + ) if not split_file.exists(): split_file = split_path / f"random_{partition}.txt" stems: List[str] = [e.rstrip("\n\r") for e in split_file.open()] for stem in stems: annotation_path: Path = annotations_dir / f"{stem}.json" - annotation_file: Optional[dt.AnnotationFile] = parse_path(annotation_path) + annotation_file: Optional[dt.AnnotationFile] = parse_path( + annotation_path + ) if annotation_file is None: continue - annotation_class_names: List[str] = [annotation.annotation_class.name for annotation in annotation_file.annotations] + annotation_class_names: List[str] = [ + annotation.annotation_class.name + for annotation in annotation_file.annotations + ] class_distribution[partition] += Counter(set(annotation_class_names)) instance_distribution[partition] += Counter(annotation_class_names) diff --git a/tests/darwin/dataset/dataset_utils_test.py b/tests/darwin/dataset/dataset_utils_test.py index f69e493a7..29157be75 100644 --- a/tests/darwin/dataset/dataset_utils_test.py +++ b/tests/darwin/dataset/dataset_utils_test.py @@ -17,7 +17,9 @@ def open_resource_file(): - resource_file = Path("tests") / "darwin" / "dataset" / "resources" / "stratified_polygon_train" + resource_file = ( + Path("tests") / "darwin" / "dataset" / "resources" / "stratified_polygon_train" + ) return resource_file.open() @@ -31,7 +33,12 @@ def parsed_annotation_file(): {"name": "class_2", "polygon": {"path": []}}, {"name": "class_3", "polygon": {"path": []}}, ], - "image": {"filename": "test.jpg", "height": 1080, "url": "https://darwin.v7labs.com/test.jpg", "width": 1920}, + "image": { + "filename": "test.jpg", + "height": 1080, + "url": 
"https://darwin.v7labs.com/test.jpg", + "width": 1920, + }, } @@ -66,7 +73,10 @@ def test_builds_correct_mapping_dictionaries(self, annotations_path: Path): payload = { "annotations": [ {"name": "class_1", "polygon": {"path": []}}, - {"name": "class_2", "bounding_box": {"x": 0, "y": 0, "w": 100, "h": 100}}, + { + "name": "class_2", + "bounding_box": {"x": 0, "y": 0, "w": 100, "h": 100}, + }, {"name": "class_3", "polygon": {"path": []}}, {"name": "class_4", "tag": {}}, {"name": "class_1", "polygon": {"path": []}}, @@ -78,7 +88,10 @@ def test_builds_correct_mapping_dictionaries(self, annotations_path: Path): payload = { "annotations": [ {"name": "class_5", "polygon": {"path": []}}, - {"name": "class_6", "bounding_box": {"x": 0, "y": 0, "w": 100, "h": 100}}, + { + "name": "class_6", + "bounding_box": {"x": 0, "y": 0, "w": 100, "h": 100}, + }, {"name": "class_1", "polygon": {"path": []}}, {"name": "class_4", "tag": {}}, {"name": "class_1", "polygon": {"path": []}}, @@ -110,7 +123,10 @@ def test_extract_multiple_annotation_types(self, annotations_path: Path): { "annotations": [ {"name": "class_1", "polygon": {"path": []}}, - {"name": "class_2", "bounding_box": {"x": 0, "y": 0, "w": 100, "h": 100}}, + { + "name": "class_2", + "bounding_box": {"x": 0, "y": 0, "w": 100, "h": 100}, + }, {"name": "class_3", "polygon": {"path": []}}, {"name": "class_4", "tag": {}}, {"name": "class_1", "polygon": {"path": []}}, @@ -124,7 +140,10 @@ def test_extract_multiple_annotation_types(self, annotations_path: Path): { "annotations": [ {"name": "class_5", "polygon": {"path": []}}, - {"name": "class_6", "bounding_box": {"x": 0, "y": 0, "w": 100, "h": 100}}, + { + "name": "class_6", + "bounding_box": {"x": 0, "y": 0, "w": 100, "h": 100}, + }, {"name": "class_1", "polygon": {"path": []}}, {"name": "class_4", "tag": {}}, {"name": "class_1", "polygon": {"path": []}}, @@ -134,10 +153,18 @@ def test_extract_multiple_annotation_types(self, annotations_path: Path): ) # Extracting classes for both bounding_box and polygon annotations - class_dict, index_dict = extract_classes(annotations_path, ["polygon", "bounding_box"]) + class_dict, index_dict = extract_classes( + annotations_path, ["polygon", "bounding_box"] + ) # Assertions - assert set(class_dict.keys()) == {"class_1", "class_2", "class_3", "class_5", "class_6"} + assert set(class_dict.keys()) == { + "class_1", + "class_2", + "class_3", + "class_5", + "class_6", + } assert class_dict["class_1"] == {0, 1} assert class_dict["class_2"] == {0} assert class_dict["class_3"] == {0} @@ -154,23 +181,53 @@ def test_normal_filenames_stay_untouched(self): assert sanitize_filename("test.jpg") == "test.jpg" def test_special_characters_are_replaced_with_underscores(self): - assert sanitize_filename("2020-06-18T08<50<13.14815Z.json") == "2020-06-18T08_50_13.14815Z.json" - assert sanitize_filename("2020-06-18T08>50>13.14815Z.json") == "2020-06-18T08_50_13.14815Z.json" - assert sanitize_filename('2020-06-18T08"50"13.14815Z.json') == "2020-06-18T08_50_13.14815Z.json" - assert sanitize_filename("2020-06-18T08/50/13.14815Z.json") == "2020-06-18T08_50_13.14815Z.json" - assert sanitize_filename("2020-06-18T08\\50\\13.14815Z.json") == "2020-06-18T08_50_13.14815Z.json" - assert sanitize_filename("2020-06-18T08|50|13.14815Z.json") == "2020-06-18T08_50_13.14815Z.json" - assert sanitize_filename("2020-06-18T08?50?13.14815Z.json") == "2020-06-18T08_50_13.14815Z.json" - assert sanitize_filename("2020-06-18T08*50*13.14815Z.json") == "2020-06-18T08_50_13.14815Z.json" + assert ( + 
sanitize_filename("2020-06-18T08<50<13.14815Z.json") + == "2020-06-18T08_50_13.14815Z.json" + ) + assert ( + sanitize_filename("2020-06-18T08>50>13.14815Z.json") + == "2020-06-18T08_50_13.14815Z.json" + ) + assert ( + sanitize_filename('2020-06-18T08"50"13.14815Z.json') + == "2020-06-18T08_50_13.14815Z.json" + ) + assert ( + sanitize_filename("2020-06-18T08/50/13.14815Z.json") + == "2020-06-18T08_50_13.14815Z.json" + ) + assert ( + sanitize_filename("2020-06-18T08\\50\\13.14815Z.json") + == "2020-06-18T08_50_13.14815Z.json" + ) + assert ( + sanitize_filename("2020-06-18T08|50|13.14815Z.json") + == "2020-06-18T08_50_13.14815Z.json" + ) + assert ( + sanitize_filename("2020-06-18T08?50?13.14815Z.json") + == "2020-06-18T08_50_13.14815Z.json" + ) + assert ( + sanitize_filename("2020-06-18T08*50*13.14815Z.json") + == "2020-06-18T08_50_13.14815Z.json" + ) @patch("platform.system", return_value="Windows") def test_replace_columns_on_windows(self, mock: MagicMock): - assert sanitize_filename("2020-06-18T08:50:13.14815Z.json") == "2020-06-18T08_50_13.14815Z.json" + assert ( + sanitize_filename("2020-06-18T08:50:13.14815Z.json") + == "2020-06-18T08_50_13.14815Z.json" + ) mock.assert_called_once() @patch("platform.system", return_value="Linux") def test_avoid_replacing_columns_on_non_windows(self, mock: MagicMock): - assert sanitize_filename("2020-06-18T08:50:13.14815Z.json") == "2020-06-18T08:50:13.14815Z.json" + assert ( + sanitize_filename("2020-06-18T08:50:13.14815Z.json") + == "2020-06-18T08:50:13.14815Z.json" + ) mock.assert_called_once() @@ -181,7 +238,9 @@ def _create_annotation_file(annotation_path: Path, filename: str, payload: Dict) class TestGetReleasePath: - def test_defaults_to_latest_version_if_no_version_provided(self, team_dataset_path: Path): + def test_defaults_to_latest_version_if_no_version_provided( + self, team_dataset_path: Path + ): latest_release_path = team_dataset_path / "releases" / "latest" latest_release_path.mkdir(parents=True) assert get_release_path(team_dataset_path) == latest_release_path diff --git a/tests/darwin/torch/dataset_test.py b/tests/darwin/torch/dataset_test.py index cc8b1dbbe..4e1a90088 100644 --- a/tests/darwin/torch/dataset_test.py +++ b/tests/darwin/torch/dataset_test.py @@ -28,14 +28,18 @@ def generic_dataset_test(ds, n, size): class TestClassificationDataset: - def test_should_correctly_create_a_single_label_dataset(self, team_slug: str, team_extracted_dataset_path: Path) -> None: + def test_should_correctly_create_a_single_label_dataset( + self, team_slug: str, team_extracted_dataset_path: Path + ) -> None: root = team_extracted_dataset_path / team_slug / "sl" ds = ClassificationDataset(dataset_path=root, release_name="latest") generic_dataset_test(ds, n=20, size=(50, 50)) assert not ds.is_multi_label - def test_should_correctly_create_a_multi_label_dataset(self, team_slug: str, team_extracted_dataset_path: Path) -> None: + def test_should_correctly_create_a_multi_label_dataset( + self, team_slug: str, team_extracted_dataset_path: Path + ) -> None: root = team_extracted_dataset_path / team_slug / "ml" ds = ClassificationDataset(dataset_path=root, release_name="latest") @@ -44,7 +48,9 @@ def test_should_correctly_create_a_multi_label_dataset(self, team_slug: str, tea class TestInstanceSegmentationDataset: - def test_should_correctly_create_a_instance_seg_dataset(self, team_slug: str, team_extracted_dataset_path: Path) -> None: + def test_should_correctly_create_a_instance_seg_dataset( + self, team_slug: str, team_extracted_dataset_path: Path + ) -> 
None: root = team_extracted_dataset_path / team_slug / "coco" ds = InstanceSegmentationDataset(dataset_path=root, release_name="latest") @@ -53,7 +59,9 @@ def test_should_correctly_create_a_instance_seg_dataset(self, team_slug: str, te class TestSemanticSegmentationDataset: - def test_should_correctly_create_a_semantic_seg_dataset(self, team_slug: str, team_extracted_dataset_path: Path) -> None: + def test_should_correctly_create_a_semantic_seg_dataset( + self, team_slug: str, team_extracted_dataset_path: Path + ) -> None: root = team_extracted_dataset_path / team_slug / "coco" ds = SemanticSegmentationDataset(dataset_path=root, release_name="latest") @@ -62,7 +70,9 @@ def test_should_correctly_create_a_semantic_seg_dataset(self, team_slug: str, te class TestObjectDetectionDataset: - def test_should_correctly_create_a_object_detection_dataset(self, team_slug: str, team_extracted_dataset_path: Path) -> None: + def test_should_correctly_create_a_object_detection_dataset( + self, team_slug: str, team_extracted_dataset_path: Path + ) -> None: root = team_extracted_dataset_path / team_slug / "coco" ds = ObjectDetectionDataset(dataset_path=root, release_name="latest") @@ -85,17 +95,26 @@ def v1_or_v2_slug(request): class TestGetDataset: - def test_exits_when_dataset_not_supported(self, v1_or_v2_slug: str, local_config_file: Config) -> None: + def test_exits_when_dataset_not_supported( + self, v1_or_v2_slug: str, local_config_file: Config + ) -> None: with patch.object(sys, "exit") as exception: get_dataset(f"{v1_or_v2_slug}/test", "unknown") exception.assert_called_once_with(1) - def test_exits_when_dataset_does_not_exist_locally(self, v1_or_v2_slug: str, local_config_file: Config) -> None: + def test_exits_when_dataset_does_not_exist_locally( + self, v1_or_v2_slug: str, local_config_file: Config + ) -> None: with patch.object(sys, "exit") as exception: get_dataset(f"{v1_or_v2_slug}/test", "classification") exception.assert_called_once_with(1) - def test_loads_classification_dataset(self, v1_or_v2_slug: str, local_config_file: Config, team_extracted_dataset_path: Path) -> None: + def test_loads_classification_dataset( + self, + v1_or_v2_slug: str, + local_config_file: Config, + team_extracted_dataset_path: Path, + ) -> None: dataset = get_dataset(f"{v1_or_v2_slug}/sl", "classification") assert isinstance(dataset, ClassificationDataset) assert len(dataset) == 20 @@ -104,7 +123,12 @@ def test_loads_classification_dataset(self, v1_or_v2_slug: str, local_config_fil assert image.size() == (3, 50, 50) assert label.item() == 0 - def test_loads_multi_label_classification_dataset(self, v1_or_v2_slug: str, local_config_file: Config, team_extracted_dataset_path: Path) -> None: + def test_loads_multi_label_classification_dataset( + self, + v1_or_v2_slug: str, + local_config_file: Config, + team_extracted_dataset_path: Path, + ) -> None: dataset = get_dataset(f"{v1_or_v2_slug}/ml", "classification") assert isinstance(dataset, ClassificationDataset) assert len(dataset) == 20 @@ -115,7 +139,10 @@ def test_loads_multi_label_classification_dataset(self, v1_or_v2_slug: str, loca assert _maybe_tensor_to_list(label) == [1, 0, 1] def test_loads_object_detection_dataset_from_bounding_box_annotations( - self, v1_or_v2_slug: str, local_config_file: Config, team_extracted_dataset_path: Path + self, + v1_or_v2_slug: str, + local_config_file: Config, + team_extracted_dataset_path: Path, ) -> None: dataset = get_dataset(f"{v1_or_v2_slug}/bb", "object-detection") assert isinstance(dataset, ObjectDetectionDataset) @@ -127,7 
+154,9 @@ def test_loads_object_detection_dataset_from_bounding_box_annotations( label = {k: v.numpy().tolist() for k, v in label.items()} assert label == { - "boxes": [[4, 33, 17, 16]], # we need to account for xywh format and clamping + "boxes": [ + [4, 33, 17, 16] + ], # we need to account for xywh format and clamping "area": [612], "labels": [1], "image_id": [0], @@ -135,7 +164,10 @@ def test_loads_object_detection_dataset_from_bounding_box_annotations( } def test_loads_object_detection_dataset_from_polygon_annotations( - self, v1_or_v2_slug: str, local_config_file: Config, team_extracted_dataset_path: Path + self, + v1_or_v2_slug: str, + local_config_file: Config, + team_extracted_dataset_path: Path, ) -> None: dataset = get_dataset(f"{v1_or_v2_slug}/coco", "object-detection") assert isinstance(dataset, ObjectDetectionDataset) @@ -146,7 +178,9 @@ def test_loads_object_detection_dataset_from_polygon_annotations( label = {k: v.numpy().tolist() for k, v in label.items()} assert label == { - "boxes": [[4, 33, 17, 16]], # we need to account for xywh format and clamping + "boxes": [ + [4, 33, 17, 16] + ], # we need to account for xywh format and clamping "area": [612], "labels": [1], "image_id": [0], @@ -154,7 +188,10 @@ def test_loads_object_detection_dataset_from_polygon_annotations( } def test_loads_object_detection_dataset_from_complex_polygon_annotations( - self, v1_or_v2_slug: str, local_config_file: Config, team_extracted_dataset_path: Path + self, + v1_or_v2_slug: str, + local_config_file: Config, + team_extracted_dataset_path: Path, ) -> None: dataset = get_dataset(f"{v1_or_v2_slug}/complex_polygons", "object-detection") assert isinstance(dataset, ObjectDetectionDataset) @@ -173,7 +210,10 @@ def test_loads_object_detection_dataset_from_complex_polygon_annotations( } def test_loads_instance_segmentation_dataset_from_bounding_box_annotations( - self, v1_or_v2_slug: str, local_config_file: Config, team_extracted_dataset_path: Path + self, + v1_or_v2_slug: str, + local_config_file: Config, + team_extracted_dataset_path: Path, ) -> None: # You can load an instance segmentation dataset from an export that only has bounding boxes. # But it will ignore all the annotations, so you'll end up with 0 annotations. 
@@ -196,7 +236,10 @@ def test_loads_instance_segmentation_dataset_from_bounding_box_annotations( assert label["width"] == 50 def test_loads_instance_segmentation_dataset_from_polygon_annotations( - self, v1_or_v2_slug: str, local_config_file: Config, team_extracted_dataset_path: Path + self, + v1_or_v2_slug: str, + local_config_file: Config, + team_extracted_dataset_path: Path, ) -> None: dataset = get_dataset(f"{v1_or_v2_slug}/coco", "instance-segmentation") assert isinstance(dataset, InstanceSegmentationDataset) @@ -217,9 +260,14 @@ def test_loads_instance_segmentation_dataset_from_polygon_annotations( assert label["width"] == 50 def test_loads_instance_segmentation_dataset_from_complex_polygon_annotations( - self, v1_or_v2_slug: str, local_config_file: Config, team_extracted_dataset_path: Path + self, + v1_or_v2_slug: str, + local_config_file: Config, + team_extracted_dataset_path: Path, ) -> None: - dataset = get_dataset(f"{v1_or_v2_slug}/complex_polygons", "instance-segmentation") + dataset = get_dataset( + f"{v1_or_v2_slug}/complex_polygons", "instance-segmentation" + ) assert isinstance(dataset, InstanceSegmentationDataset) assert len(dataset) == 1 @@ -238,7 +286,10 @@ def test_loads_instance_segmentation_dataset_from_complex_polygon_annotations( assert label["width"] == 50 def test_loads_semantic_segmentation_dataset_from_polygon_annotations( - self, v1_or_v2_slug: str, local_config_file: Config, team_extracted_dataset_path: Path + self, + v1_or_v2_slug: str, + local_config_file: Config, + team_extracted_dataset_path: Path, ) -> None: dataset = get_dataset(f"{v1_or_v2_slug}/coco", "semantic-segmentation") assert isinstance(dataset, SemanticSegmentationDataset)
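
Usage note (not part of the patch): a minimal sketch of the splitting and annotation-listing helpers reformatted above. The dataset path is hypothetical, keyword names follow the helper code visible in this diff (percentages are fractions in (0, 1), per _validate_split(); annotation_format can be "coco" or "darwin"), and the exact public signatures of split_dataset() and get_annotations() are assumptions rather than verified library API.

# Sketch only -- hypothetical local dataset path; a release named "latest" is assumed to exist.
from pathlib import Path

from darwin.dataset.split_manager import split_dataset
from darwin.dataset.utils import get_annotations, get_classes

dataset_path = Path.home() / ".darwin" / "datasets" / "my-team" / "my-dataset"  # hypothetical

# Writes random_*.txt / stratified_*_*.txt partition files under the release's lists/ folder.
split_dataset(dataset_path, val_percentage=0.1, test_percentage=0.2)

# Class names for the release, with any leading "__background__" entry stripped.
classes = get_classes(dataset_path, "latest", annotation_type="polygon", remove_background=True)

# Darwin-format annotation records for the stratified train partition.
records = list(
    get_annotations(
        dataset_path,
        partition="train",
        split_type="stratified",
        annotation_type="polygon",
        release_name="latest",
        annotation_format="darwin",
    )
)
print(len(records), classes)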
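
Similarly, for the torch wrappers exercised by the reformatted tests (the import path and team slug are assumptions; the dataset must already be pulled locally and the client configured, otherwise get_dataset() exits):

# Sketch only -- the "<team>/<dataset>" slug below is hypothetical.
from darwin.torch import get_dataset

ds = get_dataset("my-team/bb", "object-detection")
image, target = ds[0]  # target: dict of tensors with boxes, labels, area, image_id, iscrowd
print(len(ds), target["boxes"])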