diff --git a/darwin/dataset/local_dataset.py b/darwin/dataset/local_dataset.py index c686f516a..285d3e8ba 100644 --- a/darwin/dataset/local_dataset.py +++ b/darwin/dataset/local_dataset.py @@ -3,7 +3,6 @@ from typing import Any, Dict, Iterator, List, Optional, Tuple import numpy as np -import orjson as json from PIL import Image as PILImage from darwin.dataset.utils import get_classes, get_release_path, load_pil_image @@ -64,20 +63,6 @@ def __init__( split_type: str = "random", release_name: Optional[str] = None, ): - assert dataset_path is not None - release_path = get_release_path(dataset_path, release_name) - annotations_dir = release_path / "annotations" - assert annotations_dir.exists() - images_dir = dataset_path / "images" - assert images_dir.exists() - - if partition not in ["train", "val", "test", None]: - raise ValueError("partition should be either 'train', 'val', or 'test'") - if split_type not in ["random", "stratified"]: - raise ValueError("split_type should be either 'random', 'stratified'") - if annotation_type not in ["tag", "polygon", "bounding_box"]: - raise ValueError("annotation_type should be either 'tag', 'bounding_box', or 'polygon'") - self.dataset_path = dataset_path self.annotation_type = annotation_type self.images_path: List[Path] = [] @@ -86,15 +71,64 @@ def __init__( self.original_images_path: Optional[List[Path]] = None self.original_annotations_path: Optional[List[Path]] = None + release_path, annotations_dir, images_dir = self._initial_setup( + dataset_path, release_name + ) + self._validate_inputs(partition, split_type, annotation_type) # Get the list of classes + + annotation_types = [self.annotation_type] + # We fetch bounding_boxes annotations from selected polygons as well + if self.annotation_type == "bounding_box": + annotation_types.append("polygon") self.classes = get_classes( - self.dataset_path, release_name, annotation_type=self.annotation_type, remove_background=True + self.dataset_path, + release_name, + annotation_type=annotation_types, + remove_background=True, ) self.num_classes = len(self.classes) + self._setup_annotations_and_images( + release_path, + annotations_dir, + images_dir, + annotation_type, + split, + partition, + split_type, + ) + + if len(self.images_path) == 0: + raise ValueError( + f"Could not find any {SUPPORTED_IMAGE_EXTENSIONS} file", + f" in {images_dir}", + ) + + assert len(self.images_path) == len(self.annotations_path) - stems = build_stems(release_path, annotations_dir, annotation_type, split, partition, split_type) + def _validate_inputs(self, partition, split_type, annotation_type): + if partition not in ["train", "val", "test", None]: + raise ValueError("partition should be either 'train', 'val', or 'test'") + if split_type not in ["random", "stratified"]: + raise ValueError("split_type should be either 'random', 'stratified'") + if annotation_type not in ["tag", "polygon", "bounding_box"]: + raise ValueError( + "annotation_type should be either 'tag', 'bounding_box', or 'polygon'" + ) - # Find all the annotations and their corresponding images + def _setup_annotations_and_images( + self, + release_path, + annotations_dir, + images_dir, + annotation_type, + split, + partition, + split_type, + ): + stems = build_stems( + release_path, annotations_dir, annotation_type, split, partition, split_type + ) for stem in stems: annotation_path = annotations_dir / f"{stem}.json" images = [] @@ -107,16 +141,24 @@ def __init__( if image_path.exists(): images.append(image_path) if len(images) < 1: - raise 
ValueError(f"Annotation ({annotation_path}) does not have a corresponding image") + raise ValueError( + f"Annotation ({annotation_path}) does not have a corresponding image" + ) if len(images) > 1: - raise ValueError(f"Image ({stem}) is present with multiple extensions. This is forbidden.") + raise ValueError( + f"Image ({stem}) is present with multiple extensions. This is forbidden." + ) self.images_path.append(images[0]) self.annotations_path.append(annotation_path) - if len(self.images_path) == 0: - raise ValueError(f"Could not find any {SUPPORTED_IMAGE_EXTENSIONS} file", f" in {images_dir}") - - assert len(self.images_path) == len(self.annotations_path) + def _initial_setup(self, dataset_path, release_name): + assert dataset_path is not None + release_path = get_release_path(dataset_path, release_name) + annotations_dir = release_path / "annotations" + assert annotations_dir.exists() + images_dir = dataset_path / "images" + assert images_dir.exists() + return release_path, annotations_dir, images_dir def get_img_info(self, index: int) -> Dict[str, Any]: """ @@ -166,7 +208,9 @@ def get_height_and_width(self, index: int) -> Tuple[float, float]: parsed = parse_darwin_json(self.annotations_path[index], index) return parsed.image_height, parsed.image_width - def extend(self, dataset: "LocalDataset", extend_classes: bool = False) -> "LocalDataset": + def extend( + self, dataset: "LocalDataset", extend_classes: bool = False + ) -> "LocalDataset": """ Extends the current dataset with another one. @@ -261,7 +305,10 @@ def parse_json(self, index: int) -> Dict[str, Any]: # Filter out unused classes and annotations of a different type if self.classes is not None: annotations = [ - a for a in annotations if a.annotation_class.name in self.classes and self.annotation_type_supported(a) + a + for a in annotations + if a.annotation_class.name in self.classes + and self.annotation_type_supported(a) ] return { "image_id": index, @@ -278,15 +325,20 @@ def annotation_type_supported(self, annotation) -> bool: elif self.annotation_type == "bounding_box": is_bounding_box = annotation_type == "bounding_box" is_supported_polygon = ( - annotation_type in ["polygon", "complex_polygon"] and "bounding_box" in annotation.data + annotation_type in ["polygon", "complex_polygon"] + and "bounding_box" in annotation.data ) return is_bounding_box or is_supported_polygon elif self.annotation_type == "polygon": return annotation_type in ["polygon", "complex_polygon"] else: - raise ValueError("annotation_type should be either 'tag', 'bounding_box', or 'polygon'") + raise ValueError( + "annotation_type should be either 'tag', 'bounding_box', or 'polygon'" + ) - def measure_mean_std(self, multi_threaded: bool = True) -> Tuple[np.ndarray, np.ndarray]: + def measure_mean_std( + self, multi_threaded: bool = True + ) -> Tuple[np.ndarray, np.ndarray]: """ Computes mean and std of trained images, given the train loader. @@ -309,7 +361,9 @@ def measure_mean_std(self, multi_threaded: bool = True) -> Tuple[np.ndarray, np. 
results = pool.map(self._return_mean, self.images_path) mean = np.sum(np.array(results), axis=0) / len(self.images_path) # Online image_classification deviation - results = pool.starmap(self._return_std, [[item, mean] for item in self.images_path]) + results = pool.starmap( + self._return_std, [[item, mean] for item in self.images_path] + ) std_sum = np.sum(np.array([item[0] for item in results]), axis=0) total_pixel_count = np.sum(np.array([item[1] for item in results])) std = np.sqrt(std_sum / total_pixel_count) @@ -355,14 +409,20 @@ def _compute_weights(labels: List[int]) -> np.ndarray: @staticmethod def _return_mean(image_path: Path) -> np.ndarray: img = np.array(load_pil_image(image_path)) - mean = np.array([np.mean(img[:, :, 0]), np.mean(img[:, :, 1]), np.mean(img[:, :, 2])]) + mean = np.array( + [np.mean(img[:, :, 0]), np.mean(img[:, :, 1]), np.mean(img[:, :, 2])] + ) return mean / 255.0 # Loads an image with OpenCV and returns the channel wise std of the image. @staticmethod def _return_std(image_path: Path, mean: np.ndarray) -> Tuple[np.ndarray, float]: img = np.array(load_pil_image(image_path)) / 255.0 - m2 = np.square(np.array([img[:, :, 0] - mean[0], img[:, :, 1] - mean[1], img[:, :, 2] - mean[2]])) + m2 = np.square( + np.array( + [img[:, :, 0] - mean[0], img[:, :, 1] - mean[1], img[:, :, 2] - mean[2]] + ) + ) return np.sum(np.sum(m2, axis=1), 1), m2.size / 3.0 def __getitem__(self, index: int): @@ -432,7 +492,10 @@ def build_stems( """ if partition is None: - return (str(e.relative_to(annotations_dir).parent / e.stem) for e in sorted(annotations_dir.glob("**/*.json"))) + return ( + str(e.relative_to(annotations_dir).parent / e.stem) + for e in sorted(annotations_dir.glob("**/*.json")) + ) if split_type == "random": split_filename = f"{split_type}_{partition}.txt" diff --git a/darwin/dataset/split_manager.py b/darwin/dataset/split_manager.py index b831d2dc0..df7416c40 100644 --- a/darwin/dataset/split_manager.py +++ b/darwin/dataset/split_manager.py @@ -228,7 +228,12 @@ def _stratified_split( return for stratified_type in stratified_types: - _, idx_to_classes = extract_classes(annotation_path, stratified_type) + if stratified_type == "bounding_box": + class_annotation_types = [stratified_type, "polygon"] + else: + class_annotation_types = stratified_type + + _, idx_to_classes = extract_classes(annotation_path, class_annotation_types) if len(idx_to_classes) == 0: continue @@ -252,13 +257,32 @@ def _stratified_split( else: test_indices.append(idx) - _write_to_file(annotation_path, annotation_files, split[stratified_type]["train"], train_indices) - _write_to_file(annotation_path, annotation_files, split[stratified_type]["val"], val_indices) - _write_to_file(annotation_path, annotation_files, split[stratified_type]["test"], test_indices) + _write_to_file( + annotation_path, + annotation_files, + split[stratified_type]["train"], + train_indices, + ) + _write_to_file( + annotation_path, + annotation_files, + split[stratified_type]["val"], + val_indices, + ) + _write_to_file( + annotation_path, + annotation_files, + split[stratified_type]["test"], + test_indices, + ) def _stratify_samples( - idx_to_classes: Dict[int, Set[str]], split_seed: int, train_size: int, val_size: int, test_size: int + idx_to_classes: Dict[int, Set[str]], + split_seed: int, + train_size: int, + val_size: int, + test_size: int, ) -> Tuple[List[int], List[int], List[int]]: """Splits the list of indices into train, val and test according to their labels (stratified) @@ -292,8 +316,8 @@ def _stratify_samples( # 
Extract entries whose support set is 1 (it would make sklearn crash) and append the to train later unique_labels, count = np.unique(labels, return_counts=True) single_files = [] - for l in unique_labels[count == 1]: - index = np.where(labels == l)[0][0] + for label in unique_labels[count == 1]: + index = np.where(labels == label)[0][0] single_files.append(file_indices[index]) labels = np.delete(labels, index) file_indices = np.delete(file_indices, index) @@ -330,7 +354,11 @@ def _stratify_samples( # Remove duplicates within the same set # NOTE: doing that earlier (e.g. in _remove_cross_contamination()) would produce mathematical # mistakes in the class balancing between validation and test sets. - return (list(set(X_train.astype(int))), list(set(X_val.astype(int))), list(set(X_test.astype(int)))) + return ( + list(set(X_train.astype(int))), + list(set(X_val.astype(int))), + list(set(X_test.astype(int))), + ) def _remove_cross_contamination( @@ -390,20 +418,33 @@ def _unique(array: np.ndarray) -> np.ndarray: return array[sorted(indexes)] -def _write_to_file(annotation_path: Path, annotation_files: List[Path], file_path: Path, split_idx: Iterable) -> None: +def _write_to_file( + annotation_path: Path, + annotation_files: List[Path], + file_path: Path, + split_idx: Iterable, +) -> None: with open(str(file_path), "w") as f: for i in split_idx: # To deal with recursive search, we want to write the difference between the annotation path # and its parent, without the file extension - stem = str(annotation_files[i]).replace(f"{annotation_path}/", "").rsplit(".json", 1)[0] + stem = ( + str(annotation_files[i]) + .replace(f"{annotation_path}/", "") + .rsplit(".json", 1)[0] + ) f.write(f"{stem}\n") def _validate_split(val_percentage: float, test_percentage: float) -> None: if val_percentage is None or not 0 < val_percentage < 1: - raise ValueError(f"Invalid validation percentage ({val_percentage}). Must be a float x, where 0 < x < 1.") + raise ValueError( + f"Invalid validation percentage ({val_percentage}). Must be a float x, where 0 < x < 1." + ) if test_percentage is None or not 0 < test_percentage < 1: - raise ValueError(f"Invalid test percentage ({test_percentage}). Must be a float x, where 0 < x < 1.") + raise ValueError( + f"Invalid test percentage ({test_percentage}). Must be a float x, where 0 < x < 1." + ) if val_percentage + test_percentage >= 1: raise ValueError( f"Invalid combination of validation ({val_percentage}) and test ({test_percentage}) percentages. 
" @@ -412,18 +453,23 @@ def _validate_split(val_percentage: float, test_percentage: float) -> None: def _build_split( - split_path: Path, stratified_types: List[str], partitions: List[str] = ["train", "val", "test"] + split_path: Path, + stratified_types: List[str], + partitions: List[str] = ["train", "val", "test"], ) -> Split: split = Split() - split.random = {partition: split_path / f"random_{partition}.txt" for partition in partitions} + split.random = { + partition: split_path / f"random_{partition}.txt" for partition in partitions + } if len(stratified_types) == 0: return split stratified_dict: Dict[str, Dict[str, Path]] = {} for stratified_type in stratified_types: stratified_dict[stratified_type] = { - partition: split_path / f"stratified_{stratified_type}_{partition}.txt" for partition in partitions + partition: split_path / f"stratified_{stratified_type}_{partition}.txt" + for partition in partitions } split.stratified = stratified_dict return split diff --git a/darwin/dataset/utils.py b/darwin/dataset/utils.py index 3f3bb865f..f9802bea8 100644 --- a/darwin/dataset/utils.py +++ b/darwin/dataset/utils.py @@ -5,7 +5,6 @@ from typing import Any, Dict, Generator, Iterator, List, Optional, Set, Tuple, Union import numpy as np -import orjson as json from PIL import Image as PILImage from rich.live import Live from rich.progress import ProgressBar, track @@ -62,16 +61,18 @@ def get_release_path(dataset_path: Path, release_name: Optional[str] = None) -> return release_path -def extract_classes(annotations_path: Path, annotation_type: str) -> Tuple[Dict[str, Set[int]], Dict[int, Set[str]]]: +def extract_classes( + annotations_path: Path, annotation_type: Union[str, List[str]] +) -> Tuple[Dict[str, Set[int]], Dict[int, Set[str]]]: """ - Given a the GT as json files extracts all classes and an maps images index to classes. + Given the GT as json files extracts all classes and maps images index to classes. Parameters ---------- annotations_files : Path Path to the json files with the GT information of each image. - annotation_type : str - Type of annotation to use to extract the Gt information. + annotation_type : Union[str, List[str]] + Type(s) of annotation to use to extract the GT information. Returns ------- @@ -82,7 +83,13 @@ def extract_classes(annotations_path: Path, annotation_type: str) -> Tuple[Dict[ contained in that image. 
""" - assert annotation_type in ["bounding_box", "polygon", "tag"] + if isinstance(annotation_type, str): + annotation_types_to_load = [annotation_type] + else: + annotation_types_to_load = annotation_type + + for atype in annotation_types_to_load: + assert atype in ["bounding_box", "polygon", "tag"] classes: Dict[str, Set[int]] = defaultdict(set) indices_to_classes: Dict[int, Set[str]] = defaultdict(set) @@ -93,7 +100,10 @@ def extract_classes(annotations_path: Path, annotation_type: str) -> Tuple[Dict[ continue for annotation in annotation_file.annotations: - if annotation.annotation_class.annotation_type != annotation_type: + if ( + annotation.annotation_class.annotation_type + not in annotation_types_to_load + ): continue class_name = annotation.annotation_class.name @@ -131,10 +141,23 @@ def make_class_lists(release_path: Path) -> None: f.write("\n".join(classes_names)) +def get_classes_from_file(path: Path) -> List[str]: + """Helper function to read class names from a file.""" + if path.exists(): + return path.read_text().splitlines() + return [] + + +def available_annotation_types(release_path: Path) -> List[str]: + """Returns a list of available annotation types based on the existing files.""" + files = [p.name for p in release_path.glob("lists/classes_*.txt")] + return [f[len("classes_") : -len(".txt")] for f in files] + + def get_classes( dataset_path: PathLike, release_name: Optional[str] = None, - annotation_type: str = "polygon", + annotation_type: Union[str, List[str]] = "polygon", remove_background: bool = True, ) -> List[str]: """ @@ -147,7 +170,7 @@ def get_classes( release_name : Optional[str], default: None Version of the dataset. annotation_type : str, default: "polygon" - The type of annotation classes [tag, polygon]. + The type of annotation classes [tag, polygon, bounding_box]. remove_background : bool, default: True Removes the background class (if exists) from the list of classes. @@ -159,11 +182,32 @@ def get_classes( assert dataset_path is not None dataset_path = Path(dataset_path) release_path = get_release_path(dataset_path, release_name) + if isinstance(annotation_type, str): + annotation_types_to_load = [annotation_type] + else: + annotation_types_to_load = annotation_type + + classes = [] # Use a list to maintain order + for atype in annotation_types_to_load: + classes_file_path = release_path / f"lists/classes_{atype}.txt" + + class_per_annotations = get_classes_from_file(classes_file_path) + if ( + remove_background + and class_per_annotations + and class_per_annotations[0] == "__background__" + ): + class_per_annotations = class_per_annotations[1:] + + for cls in class_per_annotations: + if cls not in classes: # Only add if it's not already in the list + classes.append(cls) + + available_types = available_annotation_types(release_path) + assert ( + len(classes) > 0 + ), f"No classes found for {annotation_type}. Supported types are: {', '.join(available_types)}" - classes_path = release_path / f"lists/classes_{annotation_type}.txt" - classes = classes_path.read_text().splitlines() - if remove_background and classes[0] == "__background__": - classes = classes[1:] return classes @@ -175,28 +219,42 @@ def _f(x: Any) -> Any: def exhaust_generator( - progress: Generator, count: int, multi_threaded: bool, worker_count: Optional[int] = None + progress: Generator, + count: int, + multi_threaded: bool, + worker_count: Optional[int] = None, ) -> Tuple[List[Dict[str, Any]], List[Exception]]: """ - Exhausts the generator passed as parameter. 
Can be done multi threaded if desired.
+    Exhausts the generator passed as parameter. Can be done multi threaded if desired.

     Parameters
     ----------
-    progress : Generator
-        Generator to exhaust.
-    count : int
-        Size of the generator.
-    multi_threaded : bool
-        Flag for multi-threaded enabled operations.
-    worker_count : Optional[int]
-        Number of workers to use if multi_threaded=True. By default CPU count is used.
-
+    progress : Generator
+        Generator to exhaust.
+    count : int
+        Size of the generator.
+    multi_threaded : bool
+        Flag for multi-threaded enabled operations.
+    worker_count : Optional[int], default: None
+        Number of workers to use if multi_threaded=True. By default CPU count is used.
+
     Returns
     -------
-    List[Dict[str, Any]
-        List of responses from the generator execution.
-    List[Exception]
-        List of exceptions raised during the execution of the generator.
+    List[Dict[str, Any]]
+        List of responses from the generator execution.
+    List[Exception]
+        List of exceptions raised during the execution of the generator.
     """
     successes = []
     errors = []
@@ -238,46 +296,15 @@ def get_coco_format_record(
     image_id: Optional[Union[str, int]] = None,
     classes: Optional[List[str]] = None,
 ) -> Dict[str, Any]:
-    """
-    Creates and returns a coco record from the given annotation.
-    Uses ``BoxMode.XYXY_ABS`` from ``detectron2.structures`` if available, defaults to ``box_mode = 0``
-    otherwise.
-
-    Parameters
-    ----------
-    annotation_path : Path
-        ``Path`` to the annotation file.
-    annotation_type : str = "polygon"
-        Type of the annotation we want to retrieve.
-    image_path : Optional[Path], default: None
-        ``Path`` to the image the annotation refers to.
-    image_id : Optional[Union[str, int]], default: None
-        Id of the image the annotation refers to.
-    classes : Optional[List[str]], default: None
-        Classes of the annotation.
-
-    Returns
-    -------
-    Dict[str, Any]
-        A coco record with the following keys:
-
-        .. code-block:: python
-
-            {
-                "height": 100,
-                "width": 100,
-                "file_name": "a file name",
-                "image_id": 1,
-                "annotations": [ ...
] - } - """ assert annotation_type in ["tag", "polygon", "bounding_box"] + try: from detectron2.structures import BoxMode box_mode = BoxMode.XYXY_ABS except ImportError: box_mode = 0 + data = parse_darwin_json(annotation_path) record: Dict[str, Any] = {} @@ -285,55 +312,81 @@ def get_coco_format_record( record["file_name"] = str(image_path) if image_id is not None: record["image_id"] = image_id + record["height"] = data.image_height record["width"] = data.image_width objs = [] for obj in data.annotations: if annotation_type != obj.annotation_class.annotation_type: - if annotation_type not in obj.data: # Allows training object detection with bboxes + if ( + annotation_type not in obj.data + ): # Allows training object detection with bboxes continue - if classes: - category = classes.index(obj.annotation_class.name) - else: - category = obj.annotation_class.name - new_obj = {"bbox_mode": box_mode, "category_id": category, "iscrowd": 0} - if annotation_type == "polygon": - # Support for complex polygons - if "paths" in obj.data: - paths = obj.data["paths"] - elif "path" in obj.data: - paths = [obj.data["path"]] - else: - raise ValueError("polygon path not found") - all_px, all_py = [], [] - segmentation = [] - - for path in paths: - if len(path) < 3: # Discard polygons with less than 3 points - continue - px, py = [], [] - for point in path: - px.append(point["x"]) - py.append(point["y"]) - poly = [(x, y) for x, y in zip(px, py)] - segmentation.append(list(itertools.chain.from_iterable(poly))) - all_px.extend(px) - all_py.extend(py) - - new_obj["segmentation"] = segmentation - new_obj["bbox"] = [np.min(all_px), np.min(all_py), np.max(all_px), np.max(all_py)] + new_obj = create_polygon_object(obj, box_mode, classes) elif annotation_type == "bounding_box": - bbox = obj.data["bounding_box"] - new_obj["bbox"] = [bbox["x"], bbox["y"], bbox["x"] + bbox["w"], bbox["y"] + bbox["h"]] + new_obj = create_bbox_object(obj, box_mode, classes) + else: + continue objs.append(new_obj) + record["annotations"] = objs return record +def create_polygon_object(obj, box_mode, classes=None): + if "paths" in obj.data: + paths = obj.data["paths"] + elif "path" in obj.data: + paths = [obj.data["path"]] + else: + raise ValueError("polygon path not found") + + all_px, all_py = [], [] + segmentation = [] + + for path in paths: + if len(path) < 3: + continue + px, py = [], [] + for point in path: + px.append(point["x"]) + py.append(point["y"]) + poly = list(zip(px, py)) + segmentation.append(list(itertools.chain.from_iterable(poly))) + all_px.extend(px) + all_py.extend(py) + + new_obj = { + "segmentation": segmentation, + "bbox": [np.min(all_px), np.min(all_py), np.max(all_px), np.max(all_py)], + "bbox_mode": box_mode, + "category_id": classes.index(obj.annotation_class.name) + if classes + else obj.annotation_class.name, + "iscrowd": 0, + } + + return new_obj + + +def create_bbox_object(obj, box_mode, classes=None): + bbox = obj.data["bounding_box"] + new_obj = { + "bbox": [bbox["x"], bbox["y"], bbox["x"] + bbox["w"], bbox["y"] + bbox["h"]], + "bbox_mode": box_mode, + "category_id": classes.index(obj.annotation_class.name) + if classes + else obj.annotation_class.name, + "iscrowd": 0, + } + + return new_obj + + def get_annotations( dataset_path: PathLike, partition: Optional[str] = None, @@ -389,50 +442,132 @@ def get_annotations( dataset_path = Path(dataset_path) release_path: Path = get_release_path(dataset_path, release_name) - annotations_dir = release_path / "annotations" assert annotations_dir.exists() images_dir = 
dataset_path / "images" assert images_dir.exists() + _validate_inputs(partition, split_type, annotation_type) + + classes = get_classes( + dataset_path, + release_name, + annotation_type=annotation_type, + remove_background=True, + ) + + if partition: + stems = _get_stems_from_split( + release_path, split, split_type, annotation_type, partition + ) + else: + stems = (e.stem for e in annotations_dir.glob("**/*.json")) + + ( + images_paths, + annotations_paths, + invalid_annotation_paths, + ) = _map_annotations_to_images( + stems, annotations_dir, images_dir, ignore_inconsistent_examples + ) + + print(f"Found {len(invalid_annotation_paths)} invalid annotations") + for p in invalid_annotation_paths: + print(p) + + if len(images_paths) == 0: + raise ValueError( + f"Could not find any {SUPPORTED_EXTENSIONS} file" + f" in {dataset_path / 'images'}" + ) + + assert len(images_paths) == len(annotations_paths) + + yield from _load_and_format_annotations( + images_paths, annotations_paths, annotation_format, annotation_type, classes + ) + + +def _validate_inputs(partition, split_type, annotation_type): + """ + Validates the input parameters for partition, split_type, and annotation_type. + + Args: + partition (str, None): Dataset partition. Should be 'train', 'val', 'test' or None. + split_type (str, None): Type of dataset split. Can be 'random', 'stratified' or None. + annotation_type (str): Type of annotations. Can be 'tag', 'polygon', or 'bounding_box'. + + Raises: + ValueError: If the input parameters do not match the expected values. + """ if partition not in ["train", "val", "test", None]: raise ValueError("partition should be either 'train', 'val', 'test', or None") if split_type not in ["random", "stratified", None]: raise ValueError("split_type should be either 'random', 'stratified', or None") if annotation_type not in ["tag", "polygon", "bounding_box"]: - raise ValueError("annotation_type should be either 'tag', 'bounding_box', or 'polygon'") + raise ValueError( + "annotation_type should be either 'tag', 'bounding_box', or 'polygon'" + ) - # Get the list of classes - classes = get_classes(dataset_path, release_name, annotation_type=annotation_type, remove_background=True) - # Get the list of stems - if partition: - # Get the split - if split_type is None: - split_file = f"{partition}.txt" - elif split_type == "random": - split_file = f"{split_type}_{partition}.txt" - elif split_type == "stratified": - split_file = f"{split_type}_{annotation_type}_{partition}.txt" - else: - raise ValueError(f"Invalid split_type ({split_type})") - split_path: Path = release_path / "lists" / str(split) / split_file +def _get_stems_from_split(release_path, split, split_type, annotation_type, partition): + """ + Determines the file stems based on the dataset split and other parameters. - if split_path.is_file(): - stems: Iterator[str] = (e.rstrip("\n\r") for e in split_path.open()) - else: - raise FileNotFoundError( - "Could not find a dataset partition. ", - "To split the dataset you can use 'split_dataset' from darwin.dataset.split_manager", - ) + Args: + release_path (Path): Path to the dataset release. + split (str): Dataset split identifier. + split_type (str, None): Type of dataset split. Can be 'random', 'stratified' or None. + annotation_type (str): Type of annotations. Can be 'tag', 'polygon', or 'bounding_box'. + partition (str, None): Dataset partition. Should be 'train', 'val', 'test' or None. + + Returns: + Generator[str]: File stems for the dataset. 
+ + Raises: + ValueError: If the split_type is invalid. + FileNotFoundError: If the dataset partition file is not found. + """ + if split_type is None: + split_file = f"{partition}.txt" + elif split_type == "random": + split_file = f"{split_type}_{partition}.txt" + elif split_type == "stratified": + split_file = f"{split_type}_{annotation_type}_{partition}.txt" else: - # If the partition is not specified, get all the annotations - stems = (e.stem for e in annotations_dir.glob("**/*.json")) + raise ValueError(f"Invalid split_type ({split_type})") + split_path: Path = release_path / "lists" / str(split) / split_file + + if split_path.is_file(): + return (e.rstrip("\n\r") for e in split_path.open()) + else: + raise FileNotFoundError( + "Could not find a dataset partition. ", + "To split the dataset you can use 'split_dataset' from darwin.dataset.split_manager", + ) + + +def _map_annotations_to_images( + stems, annotations_dir, images_dir, ignore_inconsistent_examples +): + """ + Maps annotations to their corresponding images based on the file stems. + + Args: + stems (List[str]): List of file stems. + annotations_dir (Path): Directory containing annotation files. + images_dir (Path): Directory containing image files. + ignore_inconsistent_examples (bool): Flag to determine if inconsistent examples should be ignored. + + Returns: + Tuple[List[Path], List[Path], List[Path]]: Lists of paths for images, annotations, and invalid annotations respectively. + + Raises: + ValueError: If there are inconsistencies with the annotations and images. + """ images_paths = [] annotations_paths = [] - - # Find all the annotations and their corresponding images invalid_annotation_paths = [] for stem in stems: annotation_path = annotations_dir / f"{stem}.json" @@ -451,28 +586,48 @@ def get_annotations( invalid_annotation_paths.append(annotation_path) continue elif image_count < 1: - raise ValueError(f"Annotation ({annotation_path}) does not have a corresponding image") + raise ValueError( + f"Annotation ({annotation_path}) does not have a corresponding image" + ) elif image_count > 1: - raise ValueError(f"Image ({stem}) is present with multiple extensions. This is forbidden.") + raise ValueError( + f"Image ({stem}) is present with multiple extensions. This is forbidden." + ) images_paths.append(images[0]) annotations_paths.append(annotation_path) - print(f"Found {len(invalid_annotation_paths)} invalid annotations") - for p in invalid_annotation_paths: - print(p) + return images_paths, annotations_paths, invalid_annotation_paths - if len(images_paths) == 0: - raise ValueError(f"Could not find any {SUPPORTED_EXTENSIONS} file" f" in {dataset_path / 'images'}") - assert len(images_paths) == len(annotations_paths) +def _load_and_format_annotations( + images_paths, annotations_paths, annotation_format, annotation_type, classes +): + """ + Loads and formats annotations based on the specified format and type. - # Load and re-format all the annotations + Args: + images_paths (List[Path]): List of paths to image files. + annotations_paths (List[Path]): List of paths to annotation files. + annotation_format (str): Desired output format for annotations. Can be 'coco' or 'darwin'. + annotation_type (str): Type of annotations. Can be 'tag', 'polygon', or 'bounding_box'. + classes (List[str]): List of class names. + + Yields: + Dict: Formatted annotation record. + + Notes: + - If the annotation format is 'coco', video annotations cannot be loaded and will be skipped. 
+ """ if annotation_format == "coco": images_ids = list(range(len(images_paths))) - for annotation_path, image_path, image_id in zip(annotations_paths, images_paths, images_ids): + for annotation_path, image_path, image_id in zip( + annotations_paths, images_paths, images_ids + ): if image_path.suffix.lower() in SUPPORTED_VIDEO_EXTENSIONS: - print(f"[WARNING] Cannot load video annotation into COCO format. Skipping {image_path}") + print( + f"[WARNING] Cannot load video annotation into COCO format. Skipping {image_path}" + ) continue yield get_coco_format_record( annotation_path=annotation_path, @@ -610,25 +765,34 @@ def compute_distributions( - instance_distribution: count of all instances of a given class exist for each partition """ - class_distribution: AnnotationDistribution = {partition: Counter() for partition in partitions} - instance_distribution: AnnotationDistribution = {partition: Counter() for partition in partitions} + class_distribution: AnnotationDistribution = { + partition: Counter() for partition in partitions + } + instance_distribution: AnnotationDistribution = { + partition: Counter() for partition in partitions + } for partition in partitions: for annotation_type in annotation_types: - split_file: Path = split_path / f"stratified_{annotation_type}_{partition}.txt" + split_file: Path = ( + split_path / f"stratified_{annotation_type}_{partition}.txt" + ) if not split_file.exists(): split_file = split_path / f"random_{partition}.txt" stems: List[str] = [e.rstrip("\n\r") for e in split_file.open()] for stem in stems: annotation_path: Path = annotations_dir / f"{stem}.json" - annotation_file: Optional[dt.AnnotationFile] = parse_path(annotation_path) + annotation_file: Optional[dt.AnnotationFile] = parse_path( + annotation_path + ) if annotation_file is None: continue annotation_class_names: List[str] = [ - annotation.annotation_class.name for annotation in annotation_file.annotations + annotation.annotation_class.name + for annotation in annotation_file.annotations ] class_distribution[partition] += Counter(set(annotation_class_names)) diff --git a/tests/darwin/dataset/dataset_utils_test.py b/tests/darwin/dataset/dataset_utils_test.py index 88548bde3..29157be75 100644 --- a/tests/darwin/dataset/dataset_utils_test.py +++ b/tests/darwin/dataset/dataset_utils_test.py @@ -1,6 +1,6 @@ import shutil from pathlib import Path -from typing import Dict, Generator, List +from typing import Dict from unittest.mock import MagicMock, patch import orjson as json @@ -17,7 +17,9 @@ def open_resource_file(): - resource_file = Path("tests") / "darwin" / "dataset" / "resources" / "stratified_polygon_train" + resource_file = ( + Path("tests") / "darwin" / "dataset" / "resources" / "stratified_polygon_train" + ) return resource_file.open() @@ -31,7 +33,12 @@ def parsed_annotation_file(): {"name": "class_2", "polygon": {"path": []}}, {"name": "class_3", "polygon": {"path": []}}, ], - "image": {"filename": "test.jpg", "height": 1080, "url": "https://darwin.v7labs.com/test.jpg", "width": 1920}, + "image": { + "filename": "test.jpg", + "height": 1080, + "url": "https://darwin.v7labs.com/test.jpg", + "width": 1920, + }, } @@ -66,7 +73,10 @@ def test_builds_correct_mapping_dictionaries(self, annotations_path: Path): payload = { "annotations": [ {"name": "class_1", "polygon": {"path": []}}, - {"name": "class_2", "bounding_box": {"x": 0, "y": 0, "w": 100, "h": 100}}, + { + "name": "class_2", + "bounding_box": {"x": 0, "y": 0, "w": 100, "h": 100}, + }, {"name": "class_3", "polygon": {"path": []}}, 
{"name": "class_4", "tag": {}}, {"name": "class_1", "polygon": {"path": []}}, @@ -78,7 +88,10 @@ def test_builds_correct_mapping_dictionaries(self, annotations_path: Path): payload = { "annotations": [ {"name": "class_5", "polygon": {"path": []}}, - {"name": "class_6", "bounding_box": {"x": 0, "y": 0, "w": 100, "h": 100}}, + { + "name": "class_6", + "bounding_box": {"x": 0, "y": 0, "w": 100, "h": 100}, + }, {"name": "class_1", "polygon": {"path": []}}, {"name": "class_4", "tag": {}}, {"name": "class_1", "polygon": {"path": []}}, @@ -86,11 +99,11 @@ def test_builds_correct_mapping_dictionaries(self, annotations_path: Path): "image": {"filename": "1.jpg"}, } _create_annotation_file(annotations_path, "1.json", payload) - class_dict, index_dict = extract_classes(annotations_path, "polygon") - assert dict(class_dict) == {"class_1": {0, 1}, "class_3": {0}, "class_5": {1}} - assert dict(index_dict) == {0: {"class_1", "class_3"}, 1: {"class_1", "class_5"}} + assert set(index_dict.keys()) == {0, 1} + assert index_dict[0] == {"class_1", "class_3"} + assert index_dict[1] == {"class_1", "class_5"} class_dict, index_dict = extract_classes(annotations_path, "bounding_box") @@ -102,29 +115,119 @@ def test_builds_correct_mapping_dictionaries(self, annotations_path: Path): assert dict(class_dict) == {"class_4": {0, 1}} assert dict(index_dict) == {0: {"class_4"}, 1: {"class_4"}} + def test_extract_multiple_annotation_types(self, annotations_path: Path): + # Provided payloads + _create_annotation_file( + annotations_path, + "0.json", + { + "annotations": [ + {"name": "class_1", "polygon": {"path": []}}, + { + "name": "class_2", + "bounding_box": {"x": 0, "y": 0, "w": 100, "h": 100}, + }, + {"name": "class_3", "polygon": {"path": []}}, + {"name": "class_4", "tag": {}}, + {"name": "class_1", "polygon": {"path": []}}, + ], + "image": {"filename": "0.jpg"}, + }, + ) + _create_annotation_file( + annotations_path, + "1.json", + { + "annotations": [ + {"name": "class_5", "polygon": {"path": []}}, + { + "name": "class_6", + "bounding_box": {"x": 0, "y": 0, "w": 100, "h": 100}, + }, + {"name": "class_1", "polygon": {"path": []}}, + {"name": "class_4", "tag": {}}, + {"name": "class_1", "polygon": {"path": []}}, + ], + "image": {"filename": "1.jpg"}, + }, + ) + + # Extracting classes for both bounding_box and polygon annotations + class_dict, index_dict = extract_classes( + annotations_path, ["polygon", "bounding_box"] + ) + + # Assertions + assert set(class_dict.keys()) == { + "class_1", + "class_2", + "class_3", + "class_5", + "class_6", + } + assert class_dict["class_1"] == {0, 1} + assert class_dict["class_2"] == {0} + assert class_dict["class_3"] == {0} + assert class_dict["class_5"] == {1} + assert class_dict["class_6"] == {1} + + assert set(index_dict.keys()) == {0, 1} + assert index_dict[0] == {"class_1", "class_2", "class_3"} + assert index_dict[1] == {"class_1", "class_5", "class_6"} + class TestSanitizeFilename: def test_normal_filenames_stay_untouched(self): assert sanitize_filename("test.jpg") == "test.jpg" def test_special_characters_are_replaced_with_underscores(self): - assert sanitize_filename("2020-06-18T08<50<13.14815Z.json") == "2020-06-18T08_50_13.14815Z.json" - assert sanitize_filename("2020-06-18T08>50>13.14815Z.json") == "2020-06-18T08_50_13.14815Z.json" - assert sanitize_filename('2020-06-18T08"50"13.14815Z.json') == "2020-06-18T08_50_13.14815Z.json" - assert sanitize_filename("2020-06-18T08/50/13.14815Z.json") == "2020-06-18T08_50_13.14815Z.json" - assert 
sanitize_filename("2020-06-18T08\\50\\13.14815Z.json") == "2020-06-18T08_50_13.14815Z.json" - assert sanitize_filename("2020-06-18T08|50|13.14815Z.json") == "2020-06-18T08_50_13.14815Z.json" - assert sanitize_filename("2020-06-18T08?50?13.14815Z.json") == "2020-06-18T08_50_13.14815Z.json" - assert sanitize_filename("2020-06-18T08*50*13.14815Z.json") == "2020-06-18T08_50_13.14815Z.json" + assert ( + sanitize_filename("2020-06-18T08<50<13.14815Z.json") + == "2020-06-18T08_50_13.14815Z.json" + ) + assert ( + sanitize_filename("2020-06-18T08>50>13.14815Z.json") + == "2020-06-18T08_50_13.14815Z.json" + ) + assert ( + sanitize_filename('2020-06-18T08"50"13.14815Z.json') + == "2020-06-18T08_50_13.14815Z.json" + ) + assert ( + sanitize_filename("2020-06-18T08/50/13.14815Z.json") + == "2020-06-18T08_50_13.14815Z.json" + ) + assert ( + sanitize_filename("2020-06-18T08\\50\\13.14815Z.json") + == "2020-06-18T08_50_13.14815Z.json" + ) + assert ( + sanitize_filename("2020-06-18T08|50|13.14815Z.json") + == "2020-06-18T08_50_13.14815Z.json" + ) + assert ( + sanitize_filename("2020-06-18T08?50?13.14815Z.json") + == "2020-06-18T08_50_13.14815Z.json" + ) + assert ( + sanitize_filename("2020-06-18T08*50*13.14815Z.json") + == "2020-06-18T08_50_13.14815Z.json" + ) @patch("platform.system", return_value="Windows") def test_replace_columns_on_windows(self, mock: MagicMock): - assert sanitize_filename("2020-06-18T08:50:13.14815Z.json") == "2020-06-18T08_50_13.14815Z.json" + assert ( + sanitize_filename("2020-06-18T08:50:13.14815Z.json") + == "2020-06-18T08_50_13.14815Z.json" + ) mock.assert_called_once() @patch("platform.system", return_value="Linux") def test_avoid_replacing_columns_on_non_windows(self, mock: MagicMock): - assert sanitize_filename("2020-06-18T08:50:13.14815Z.json") == "2020-06-18T08:50:13.14815Z.json" + assert ( + sanitize_filename("2020-06-18T08:50:13.14815Z.json") + == "2020-06-18T08:50:13.14815Z.json" + ) mock.assert_called_once() @@ -135,7 +238,9 @@ def _create_annotation_file(annotation_path: Path, filename: str, payload: Dict) class TestGetReleasePath: - def test_defaults_to_latest_version_if_no_version_provided(self, team_dataset_path: Path): + def test_defaults_to_latest_version_if_no_version_provided( + self, team_dataset_path: Path + ): latest_release_path = team_dataset_path / "releases" / "latest" latest_release_path.mkdir(parents=True) assert get_release_path(team_dataset_path) == latest_release_path @@ -184,3 +289,42 @@ def test_passes_back_exceptions(self): assert len(successes) == 1 assert isinstance(errors[0], Exception) assert errors[0].args[0] == "Test" + + +''' +class TestGetAnnotations: + def test_basic_functionality( + self, + team_extracted_dataset_path, + team_dataset_release_path, + annotations_path, + split_path + ): + """ + Basic functionality test for the `get_annotations` function. + """ + + # Test with basic setup + annotations = list(get_annotations(dataset_path=team_extracted_dataset_path)) + assert len(annotations) > 0, "Expected to find some annotations" + + # Add more assertions here to validate the structure of the returned annotations + + def test_partition_handling( + self, + team_extracted_dataset_path, + team_dataset_release_path, + annotations_path, + split_path + ): + """ + Test the partition handling of the `get_annotations` function. 
+ """ + + # Assuming there's a train partition in the test dataset + annotations = list(get_annotations(dataset_path=team_extracted_dataset_path, partition="train")) + assert len(annotations) > 0, "Expected to find some annotations for the train partition" + + # Add more assertions here to validate the structure of the returned annotations + # Repeat for other partitions (e.g., val, test) if present in the mock data +''' diff --git a/tests/darwin/torch/dataset_test.py b/tests/darwin/torch/dataset_test.py index 50484c00c..4e1a90088 100644 --- a/tests/darwin/torch/dataset_test.py +++ b/tests/darwin/torch/dataset_test.py @@ -15,7 +15,7 @@ SemanticSegmentationDataset, get_dataset, ) -from tests.fixtures import * +from tests.fixtures import * # noqa: F403 def generic_dataset_test(ds, n, size): @@ -55,7 +55,7 @@ def test_should_correctly_create_a_instance_seg_dataset( ds = InstanceSegmentationDataset(dataset_path=root, release_name="latest") generic_dataset_test(ds, n=20, size=(50, 50)) - assert type(ds[0][1]) is dict + assert isinstance(ds[0][1], dict) class TestSemanticSegmentationDataset: @@ -66,7 +66,7 @@ def test_should_correctly_create_a_semantic_seg_dataset( ds = SemanticSegmentationDataset(dataset_path=root, release_name="latest") generic_dataset_test(ds, n=20, size=(50, 50)) - assert type(ds[0][1]) is dict + assert isinstance(ds[0][1], dict) class TestObjectDetectionDataset: @@ -77,7 +77,7 @@ def test_should_correctly_create_a_object_detection_dataset( ds = ObjectDetectionDataset(dataset_path=root, release_name="latest") generic_dataset_test(ds, n=20, size=(50, 50)) - assert type(ds[0][1]) is dict + assert isinstance(ds[0][1], dict) img, target = ds[0] for bbox in target["boxes"]: @@ -95,18 +95,25 @@ def v1_or_v2_slug(request): class TestGetDataset: - def test_exits_when_dataset_not_supported(self, v1_or_v2_slug: str, local_config_file: Config) -> None: + def test_exits_when_dataset_not_supported( + self, v1_or_v2_slug: str, local_config_file: Config + ) -> None: with patch.object(sys, "exit") as exception: get_dataset(f"{v1_or_v2_slug}/test", "unknown") exception.assert_called_once_with(1) - def test_exits_when_dataset_does_not_exist_locally(self, v1_or_v2_slug: str, local_config_file: Config) -> None: + def test_exits_when_dataset_does_not_exist_locally( + self, v1_or_v2_slug: str, local_config_file: Config + ) -> None: with patch.object(sys, "exit") as exception: get_dataset(f"{v1_or_v2_slug}/test", "classification") exception.assert_called_once_with(1) def test_loads_classification_dataset( - self, v1_or_v2_slug: str, local_config_file: Config, team_extracted_dataset_path: Path + self, + v1_or_v2_slug: str, + local_config_file: Config, + team_extracted_dataset_path: Path, ) -> None: dataset = get_dataset(f"{v1_or_v2_slug}/sl", "classification") assert isinstance(dataset, ClassificationDataset) @@ -117,7 +124,10 @@ def test_loads_classification_dataset( assert label.item() == 0 def test_loads_multi_label_classification_dataset( - self, v1_or_v2_slug: str, local_config_file: Config, team_extracted_dataset_path: Path + self, + v1_or_v2_slug: str, + local_config_file: Config, + team_extracted_dataset_path: Path, ) -> None: dataset = get_dataset(f"{v1_or_v2_slug}/ml", "classification") assert isinstance(dataset, ClassificationDataset) @@ -129,7 +139,10 @@ def test_loads_multi_label_classification_dataset( assert _maybe_tensor_to_list(label) == [1, 0, 1] def test_loads_object_detection_dataset_from_bounding_box_annotations( - self, v1_or_v2_slug: str, local_config_file: Config, 
team_extracted_dataset_path: Path + self, + v1_or_v2_slug: str, + local_config_file: Config, + team_extracted_dataset_path: Path, ) -> None: dataset = get_dataset(f"{v1_or_v2_slug}/bb", "object-detection") assert isinstance(dataset, ObjectDetectionDataset) @@ -141,7 +154,9 @@ def test_loads_object_detection_dataset_from_bounding_box_annotations( label = {k: v.numpy().tolist() for k, v in label.items()} assert label == { - "boxes": [[4, 33, 17, 16]], # we need to account for xywh format and clamping + "boxes": [ + [4, 33, 17, 16] + ], # we need to account for xywh format and clamping "area": [612], "labels": [1], "image_id": [0], @@ -149,7 +164,10 @@ def test_loads_object_detection_dataset_from_bounding_box_annotations( } def test_loads_object_detection_dataset_from_polygon_annotations( - self, v1_or_v2_slug: str, local_config_file: Config, team_extracted_dataset_path: Path + self, + v1_or_v2_slug: str, + local_config_file: Config, + team_extracted_dataset_path: Path, ) -> None: dataset = get_dataset(f"{v1_or_v2_slug}/coco", "object-detection") assert isinstance(dataset, ObjectDetectionDataset) @@ -160,7 +178,9 @@ def test_loads_object_detection_dataset_from_polygon_annotations( label = {k: v.numpy().tolist() for k, v in label.items()} assert label == { - "boxes": [[4, 33, 17, 16]], # we need to account for xywh format and clamping + "boxes": [ + [4, 33, 17, 16] + ], # we need to account for xywh format and clamping "area": [612], "labels": [1], "image_id": [0], @@ -168,7 +188,10 @@ def test_loads_object_detection_dataset_from_polygon_annotations( } def test_loads_object_detection_dataset_from_complex_polygon_annotations( - self, v1_or_v2_slug: str, local_config_file: Config, team_extracted_dataset_path: Path + self, + v1_or_v2_slug: str, + local_config_file: Config, + team_extracted_dataset_path: Path, ) -> None: dataset = get_dataset(f"{v1_or_v2_slug}/complex_polygons", "object-detection") assert isinstance(dataset, ObjectDetectionDataset) @@ -187,7 +210,10 @@ def test_loads_object_detection_dataset_from_complex_polygon_annotations( } def test_loads_instance_segmentation_dataset_from_bounding_box_annotations( - self, v1_or_v2_slug: str, local_config_file: Config, team_extracted_dataset_path: Path + self, + v1_or_v2_slug: str, + local_config_file: Config, + team_extracted_dataset_path: Path, ) -> None: # You can load an instance segmentation dataset from an export that only has bounding boxes. # But it will ignore all the annotations, so you'll end up with 0 annotations. 
@@ -210,7 +236,10 @@ def test_loads_instance_segmentation_dataset_from_bounding_box_annotations( assert label["width"] == 50 def test_loads_instance_segmentation_dataset_from_polygon_annotations( - self, v1_or_v2_slug: str, local_config_file: Config, team_extracted_dataset_path: Path + self, + v1_or_v2_slug: str, + local_config_file: Config, + team_extracted_dataset_path: Path, ) -> None: dataset = get_dataset(f"{v1_or_v2_slug}/coco", "instance-segmentation") assert isinstance(dataset, InstanceSegmentationDataset) @@ -231,9 +260,14 @@ def test_loads_instance_segmentation_dataset_from_polygon_annotations( assert label["width"] == 50 def test_loads_instance_segmentation_dataset_from_complex_polygon_annotations( - self, v1_or_v2_slug: str, local_config_file: Config, team_extracted_dataset_path: Path + self, + v1_or_v2_slug: str, + local_config_file: Config, + team_extracted_dataset_path: Path, ) -> None: - dataset = get_dataset(f"{v1_or_v2_slug}/complex_polygons", "instance-segmentation") + dataset = get_dataset( + f"{v1_or_v2_slug}/complex_polygons", "instance-segmentation" + ) assert isinstance(dataset, InstanceSegmentationDataset) assert len(dataset) == 1 @@ -252,7 +286,10 @@ def test_loads_instance_segmentation_dataset_from_complex_polygon_annotations( assert label["width"] == 50 def test_loads_semantic_segmentation_dataset_from_polygon_annotations( - self, v1_or_v2_slug: str, local_config_file: Config, team_extracted_dataset_path: Path + self, + v1_or_v2_slug: str, + local_config_file: Config, + team_extracted_dataset_path: Path, ) -> None: dataset = get_dataset(f"{v1_or_v2_slug}/coco", "semantic-segmentation") assert isinstance(dataset, SemanticSegmentationDataset) @@ -265,7 +302,7 @@ def test_loads_semantic_segmentation_dataset_from_polygon_annotations( label = {k: _maybe_tensor_to_list(v) for k, v in label.items()} assert label["image_id"] == [0] - assert type(label["mask"][0]) == list + assert isinstance(label["mask"][0], list) assert label["height"] == 50 assert label["width"] == 50
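
Usage sketch (not part of the patch): the behavioural change above is that get_classes and extract_classes now accept either a single annotation type or a list of types, and LocalDataset adds "polygon" to the lookup when annotation_type="bounding_box". The dataset path and release layout below are illustrative assumptions, not values taken from this diff.

    from pathlib import Path

    from darwin.dataset.utils import extract_classes, get_classes

    # Assumed location of a locally pulled dataset export; adjust as needed.
    dataset_path = Path.home() / ".darwin" / "datasets" / "my-team" / "my-dataset"

    # A single annotation type keeps the old behaviour.
    polygon_classes = get_classes(dataset_path, annotation_type="polygon")

    # New: passing a list merges the per-type class lists in order, without
    # duplicates -- this is what LocalDataset relies on for "bounding_box".
    detection_classes = get_classes(
        dataset_path, annotation_type=["bounding_box", "polygon"]
    )

    # extract_classes takes the same Union[str, List[str]] argument; the
    # annotations directory shown here assumes the standard release layout.
    class_to_images, image_to_classes = extract_classes(
        dataset_path / "releases" / "latest" / "annotations",
        ["polygon", "bounding_box"],
    )

    print(
        len(polygon_classes),
        len(detection_classes),
        len(class_to_images),
        len(image_to_classes),
    )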