From db229fc3523b74acf1e31be2c15b0c4841c98f95 Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Fri, 27 Oct 2023 20:08:22 +0100 Subject: [PATCH 1/6] Fix for pulling releases containing folders but in a flat structure (WIP) --- darwin/dataset/local_dataset.py | 5 +- darwin/dataset/utils.py | 9 +- darwin/utils/utils.py | 320 +++++++++++++++++++++++++------- 3 files changed, 258 insertions(+), 76 deletions(-) diff --git a/darwin/dataset/local_dataset.py b/darwin/dataset/local_dataset.py index d79ecae76..23aa6f87c 100644 --- a/darwin/dataset/local_dataset.py +++ b/darwin/dataset/local_dataset.py @@ -132,9 +132,12 @@ def _setup_annotations_and_images( split_type, ): # Find all the annotations and their corresponding images + with_folders = any([item.is_dir() for item in images_dir.iterdir()]) for annotation_path in sorted(annotations_dir.glob("**/*.json")): darwin_json = stream_darwin_json(annotation_path) - image_path = get_image_path_from_stream(darwin_json, images_dir) + image_path = get_image_path_from_stream( + darwin_json, images_dir, with_folders + ) if image_path.exists(): self.images_path.append(image_path) self.annotations_path.append(annotation_path) diff --git a/darwin/dataset/utils.py b/darwin/dataset/utils.py index c11e3576a..5d295e3e3 100644 --- a/darwin/dataset/utils.py +++ b/darwin/dataset/utils.py @@ -230,7 +230,7 @@ def exhaust_generator( Exhausts the generator passed as parameter. Can be done multi threaded if desired. Creates and returns a coco record from the given annotation. - + Uses ``BoxMode.XYXY_ABS`` from ``detectron2.structures`` if available, defaults to ``box_mode = 0`` otherwise. Parameters @@ -571,9 +571,10 @@ def _map_annotations_to_images( images_paths = [] annotations_paths = [] invalid_annotation_paths = [] + with_folders = any([item.is_dir() for item in images_dir.iterdir()]) for annotation_path in annotations_dir.glob("**/*.json"): darwin_json = stream_darwin_json(annotation_path) - image_path = get_image_path_from_stream(darwin_json, images_dir) + image_path = get_image_path_from_stream(darwin_json, images_dir, with_folders) if image_path.exists(): images_paths.append(image_path) annotations_paths.append(annotation_path) @@ -583,7 +584,9 @@ def _map_annotations_to_images( invalid_annotation_paths.append(annotation_path) continue else: - raise ValueError(f"Annotation ({annotation_path}) does not have a corresponding image") + raise ValueError( + f"Annotation ({annotation_path}) does not have a corresponding image" + ) return images_paths, annotations_paths, invalid_annotation_paths diff --git a/darwin/utils/utils.py b/darwin/utils/utils.py index 68f81a53f..4934b1c9b 100644 --- a/darwin/utils/utils.py +++ b/darwin/utils/utils.py @@ -216,7 +216,9 @@ def is_project_dir(project_path: Path) -> bool: return (project_path / "releases").exists() and (project_path / "images").exists() -def get_progress_bar(array: List[dt.AnnotationFile], description: Optional[str] = None) -> Iterable[ProgressType]: +def get_progress_bar( + array: List[dt.AnnotationFile], description: Optional[str] = None +) -> Iterable[ProgressType]: """ Get a rich a progress bar for the given list of annotation files. @@ -265,7 +267,10 @@ def prompt(msg: str, default: Optional[str] = None) -> str: def find_files( - files: List[dt.PathLike], *, files_to_exclude: List[dt.PathLike] = [], recursive: bool = True + files: List[dt.PathLike], + *, + files_to_exclude: List[dt.PathLike] = [], + recursive: bool = True, ) -> List[Path]: """ Retrieve a list of all files belonging to supported extensions. The exploration can be made @@ -322,7 +327,9 @@ def secure_continue_request() -> bool: def persist_client_configuration( - client: "Client", default_team: Optional[str] = None, config_path: Optional[Path] = None + client: "Client", + default_team: Optional[str] = None, + config_path: Optional[Path] = None, ) -> Config: """ Authenticate user against the server and creates a configuration file for him/her. @@ -350,8 +357,14 @@ def persist_client_configuration( raise ValueError("Unable to get default team.") config: Config = Config(config_path) - config.set_team(team=team_config.slug, api_key=team_config.api_key, datasets_dir=team_config.datasets_dir) - config.set_global(api_endpoint=client.url, base_url=client.base_url, default_team=default_team) + config.set_team( + team=team_config.slug, + api_key=team_config.api_key, + datasets_dir=team_config.datasets_dir, + ) + config.set_global( + api_endpoint=client.url, base_url=client.base_url, default_team=default_team + ) return config @@ -408,7 +421,9 @@ def attempt_decode(path: Path) -> dict: return data except Exception: continue - raise UnrecognizableFileEncoding(f"Unable to load file {path} with any encodings: {encodings}") + raise UnrecognizableFileEncoding( + f"Unable to load file {path} with any encodings: {encodings}" + ) def load_data_from_file(path: Path) -> Tuple[dict, dt.AnnotationFileVersion]: @@ -417,7 +432,9 @@ def load_data_from_file(path: Path) -> Tuple[dict, dt.AnnotationFileVersion]: return data, version -def parse_darwin_json(path: Path, count: Optional[int] = None) -> Optional[dt.AnnotationFile]: +def parse_darwin_json( + path: Path, count: Optional[int] = None +) -> Optional[dt.AnnotationFile]: """ Parses the given JSON file in v7's darwin proprietary format. Works for images, split frame videos (treated as images) and playback videos. @@ -456,6 +473,7 @@ def parse_darwin_json(path: Path, count: Optional[int] = None) -> Optional[dt.An else: return _parse_darwin_image(path, data, count) + def stream_darwin_json(path: Path) -> PersistentStreamingJSONObject: """ Returns a Darwin JSON file as a persistent stream. This allows for parsing large files without @@ -474,8 +492,11 @@ def stream_darwin_json(path: Path) -> PersistentStreamingJSONObject: with path.open() as infile: return json_stream.load(infile, persistent=True) - -def get_image_path_from_stream(darwin_json: PersistentStreamingJSONObject, images_dir: Path) -> Path: + + +def get_image_path_from_stream( + darwin_json: PersistentStreamingJSONObject, images_dir: Path, with_folders: bool +) -> Path: """ Returns the path to the image file associated with the given darwin json file (V1 or V2). @@ -485,23 +506,46 @@ def get_image_path_from_stream(darwin_json: PersistentStreamingJSONObject, image A stream of the JSON file. images_dir : Path Path to the directory containing the images. + with_folders: Bool + Flag to determine if the release was pulled with or without folders. Returns ------- Path Path to the image file. """ - try: - return images_dir / (Path(darwin_json['item']['path'].lstrip('/\\'))) / Path(darwin_json['item']['name']) - except KeyError: - return images_dir / (Path(darwin_json['image']['path'].lstrip('/\\'))) / Path(darwin_json['image']['filename']) + if not with_folders: + try: + return images_dir / Path(darwin_json["item"]["name"]) + except KeyError: + return images_dir / Path(darwin_json["image"]["filename"]) + else: + try: + return ( + images_dir + / (Path(darwin_json["item"]["path"].lstrip("/\\"))) + / Path(darwin_json["item"]["name"]) + ) + except KeyError: + return ( + images_dir + / (Path(darwin_json["image"]["path"].lstrip("/\\"))) + / Path(darwin_json["image"]["filename"]) + ) + def _parse_darwin_v2(path: Path, data: Dict[str, Any]) -> dt.AnnotationFile: item = data["item"] item_source = item.get("source_info", {}) - slots: List[dt.Slot] = list(filter(None, map(_parse_darwin_slot, item.get("slots", [])))) - annotations: List[Union[dt.Annotation, dt.VideoAnnotation]] = _data_to_annotations(data) - annotation_classes: Set[dt.AnnotationClass] = set([annotation.annotation_class for annotation in annotations]) + slots: List[dt.Slot] = list( + filter(None, map(_parse_darwin_slot, item.get("slots", []))) + ) + annotations: List[Union[dt.Annotation, dt.VideoAnnotation]] = _data_to_annotations( + data + ) + annotation_classes: Set[dt.AnnotationClass] = set( + [annotation.annotation_class for annotation in annotations] + ) if len(slots) == 0: annotation_file = dt.AnnotationFile( @@ -509,7 +553,9 @@ def _parse_darwin_v2(path: Path, data: Dict[str, Any]) -> dt.AnnotationFile: path=path, filename=item["name"], item_id=item.get("source_info", {}).get("item_id", None), - dataset_name=item.get("source_info", {}).get("dataset", {}).get("name", None), + dataset_name=item.get("source_info", {}) + .get("dataset", {}) + .get("name", None), annotation_classes=annotation_classes, annotations=annotations, is_video=False, @@ -530,13 +576,17 @@ def _parse_darwin_v2(path: Path, data: Dict[str, Any]) -> dt.AnnotationFile: path=path, filename=item["name"], item_id=item.get("source_info", {}).get("item_id", None), - dataset_name=item.get("source_info", {}).get("dataset", {}).get("name", None), + dataset_name=item.get("source_info", {}) + .get("dataset", {}) + .get("name", None), annotation_classes=annotation_classes, annotations=annotations, is_video=slot.frame_urls is not None, image_width=slot.width, image_height=slot.height, - image_url=None if len(slot.source_files or []) == 0 else slot.source_files[0]["url"], + image_url=None + if len(slot.source_files or []) == 0 + else slot.source_files[0]["url"], image_thumbnail_url=slot.thumbnail_url, workview_url=item_source.get("workview_url", None), seq=0, @@ -565,14 +615,25 @@ def _parse_darwin_slot(data: Dict[str, Any]) -> dt.Slot: ) -def _parse_darwin_image(path: Path, data: Dict[str, Any], count: Optional[int]) -> dt.AnnotationFile: - annotations: List[Union[dt.Annotation, dt.VideoAnnotation]] = _data_to_annotations(data) - annotation_classes: Set[dt.AnnotationClass] = set([annotation.annotation_class for annotation in annotations]) +def _parse_darwin_image( + path: Path, data: Dict[str, Any], count: Optional[int] +) -> dt.AnnotationFile: + annotations: List[Union[dt.Annotation, dt.VideoAnnotation]] = _data_to_annotations( + data + ) + annotation_classes: Set[dt.AnnotationClass] = set( + [annotation.annotation_class for annotation in annotations] + ) slot = dt.Slot( name=None, type="image", - source_files=[{"url": data["image"].get("url"), "file_name": _get_local_filename(data["image"])}], + source_files=[ + { + "url": data["image"].get("url"), + "file_name": _get_local_filename(data["image"]), + } + ], thumbnail_url=data["image"].get("thumbnail_url"), width=data["image"].get("width"), height=data["image"].get("height"), @@ -599,17 +660,30 @@ def _parse_darwin_image(path: Path, data: Dict[str, Any], count: Optional[int]) return annotation_file -def _parse_darwin_video(path: Path, data: Dict[str, Any], count: Optional[int]) -> dt.AnnotationFile: - annotations: List[Union[dt.Annotation, dt.VideoAnnotation]] = _data_to_annotations(data) - annotation_classes: Set[dt.AnnotationClass] = set([annotation.annotation_class for annotation in annotations]) +def _parse_darwin_video( + path: Path, data: Dict[str, Any], count: Optional[int] +) -> dt.AnnotationFile: + annotations: List[Union[dt.Annotation, dt.VideoAnnotation]] = _data_to_annotations( + data + ) + annotation_classes: Set[dt.AnnotationClass] = set( + [annotation.annotation_class for annotation in annotations] + ) if "width" not in data["image"] or "height" not in data["image"]: - raise OutdatedDarwinJSONFormat("Missing width/height in video, please re-export") + raise OutdatedDarwinJSONFormat( + "Missing width/height in video, please re-export" + ) slot = dt.Slot( name=None, type="video", - source_files=[{"url": data["image"].get("url"), "file_name": _get_local_filename(data["image"])}], + source_files=[ + { + "url": data["image"].get("url"), + "file_name": _get_local_filename(data["image"]), + } + ], thumbnail_url=data["image"].get("thumbnail_url"), width=data["image"].get("width"), height=data["image"].get("height"), @@ -645,23 +719,41 @@ def _parse_darwin_annotation(annotation: Dict[str, Any]) -> Optional[dt.Annotati main_annotation: Optional[dt.Annotation] = None # Darwin JSON 2.0 representation of complex polygons - if "polygon" in annotation and "paths" in annotation["polygon"] and len(annotation["polygon"]["paths"]) > 1: + if ( + "polygon" in annotation + and "paths" in annotation["polygon"] + and len(annotation["polygon"]["paths"]) > 1 + ): bounding_box = annotation.get("bounding_box") paths = annotation["polygon"]["paths"] - main_annotation = dt.make_complex_polygon(name, paths, bounding_box, slot_names=slot_names) + main_annotation = dt.make_complex_polygon( + name, paths, bounding_box, slot_names=slot_names + ) # Darwin JSON 2.0 representation of simple polygons - elif "polygon" in annotation and "paths" in annotation["polygon"] and len(annotation["polygon"]["paths"]) == 1: + elif ( + "polygon" in annotation + and "paths" in annotation["polygon"] + and len(annotation["polygon"]["paths"]) == 1 + ): bounding_box = annotation.get("bounding_box") paths = annotation["polygon"]["paths"] - main_annotation = dt.make_polygon(name, paths[0], bounding_box, slot_names=slot_names) + main_annotation = dt.make_polygon( + name, paths[0], bounding_box, slot_names=slot_names + ) # Darwin JSON 1.0 representation of complex and simple polygons elif "polygon" in annotation: bounding_box = annotation.get("bounding_box") if "additional_paths" in annotation["polygon"]: - paths = [annotation["polygon"]["path"]] + annotation["polygon"]["additional_paths"] - main_annotation = dt.make_complex_polygon(name, paths, bounding_box, slot_names=slot_names) + paths = [annotation["polygon"]["path"]] + annotation["polygon"][ + "additional_paths" + ] + main_annotation = dt.make_complex_polygon( + name, paths, bounding_box, slot_names=slot_names + ) else: - main_annotation = dt.make_polygon(name, annotation["polygon"]["path"], bounding_box, slot_names=slot_names) + main_annotation = dt.make_polygon( + name, annotation["polygon"]["path"], bounding_box, slot_names=slot_names + ) # Darwin JSON 1.0 representation of complex polygons elif "complex_polygon" in annotation: bounding_box = annotation.get("bounding_box") @@ -673,42 +765,72 @@ def _parse_darwin_annotation(annotation: Dict[str, Any]) -> Optional[dt.Annotati if "additional_paths" in annotation["complex_polygon"]: paths.extend(annotation["complex_polygon"]["additional_paths"]) - main_annotation = dt.make_complex_polygon(name, paths, bounding_box, slot_names=slot_names) + main_annotation = dt.make_complex_polygon( + name, paths, bounding_box, slot_names=slot_names + ) elif "bounding_box" in annotation: bounding_box = annotation["bounding_box"] main_annotation = dt.make_bounding_box( - name, bounding_box["x"], bounding_box["y"], bounding_box["w"], bounding_box["h"], slot_names=slot_names + name, + bounding_box["x"], + bounding_box["y"], + bounding_box["w"], + bounding_box["h"], + slot_names=slot_names, ) elif "tag" in annotation: main_annotation = dt.make_tag(name, slot_names=slot_names) elif "line" in annotation: - main_annotation = dt.make_line(name, annotation["line"]["path"], slot_names=slot_names) + main_annotation = dt.make_line( + name, annotation["line"]["path"], slot_names=slot_names + ) elif "keypoint" in annotation: main_annotation = dt.make_keypoint( - name, annotation["keypoint"]["x"], annotation["keypoint"]["y"], slot_names=slot_names + name, + annotation["keypoint"]["x"], + annotation["keypoint"]["y"], + slot_names=slot_names, ) elif "ellipse" in annotation: - main_annotation = dt.make_ellipse(name, annotation["ellipse"], slot_names=slot_names) + main_annotation = dt.make_ellipse( + name, annotation["ellipse"], slot_names=slot_names + ) elif "cuboid" in annotation: - main_annotation = dt.make_cuboid(name, annotation["cuboid"], slot_names=slot_names) + main_annotation = dt.make_cuboid( + name, annotation["cuboid"], slot_names=slot_names + ) elif "skeleton" in annotation: - main_annotation = dt.make_skeleton(name, annotation["skeleton"]["nodes"], slot_names=slot_names) + main_annotation = dt.make_skeleton( + name, annotation["skeleton"]["nodes"], slot_names=slot_names + ) elif "table" in annotation: main_annotation = dt.make_table( - name, annotation["table"]["bounding_box"], annotation["table"]["cells"], slot_names=slot_names + name, + annotation["table"]["bounding_box"], + annotation["table"]["cells"], + slot_names=slot_names, ) elif "string" in annotation: - main_annotation = dt.make_string(name, annotation["string"]["sources"], slot_names=slot_names) + main_annotation = dt.make_string( + name, annotation["string"]["sources"], slot_names=slot_names + ) elif "graph" in annotation: main_annotation = dt.make_graph( - name, annotation["graph"]["nodes"], annotation["graph"]["edges"], slot_names=slot_names + name, + annotation["graph"]["nodes"], + annotation["graph"]["edges"], + slot_names=slot_names, ) elif "mask" in annotation: main_annotation = dt.make_mask(name, slot_names=slot_names) elif "raster_layer" in annotation: raster_layer = annotation["raster_layer"] main_annotation = dt.make_raster_layer( - name, raster_layer["mask_annotation_ids_mapping"], raster_layer["total_pixels"], raster_layer["dense_rle"], slot_names=slot_names + name, + raster_layer["mask_annotation_ids_mapping"], + raster_layer["total_pixels"], + raster_layer["dense_rle"], + slot_names=slot_names, ) if not main_annotation: @@ -718,19 +840,29 @@ def _parse_darwin_annotation(annotation: Dict[str, Any]) -> Optional[dt.Annotati if "id" in annotation: main_annotation.id = annotation["id"] if "instance_id" in annotation: - main_annotation.subs.append(dt.make_instance_id(annotation["instance_id"]["value"])) + main_annotation.subs.append( + dt.make_instance_id(annotation["instance_id"]["value"]) + ) if "attributes" in annotation: main_annotation.subs.append(dt.make_attributes(annotation["attributes"])) if "text" in annotation: main_annotation.subs.append(dt.make_text(annotation["text"]["text"])) if "inference" in annotation: - main_annotation.subs.append(dt.make_opaque_sub("inference", annotation["inference"])) + main_annotation.subs.append( + dt.make_opaque_sub("inference", annotation["inference"]) + ) if "directional_vector" in annotation: - main_annotation.subs.append(dt.make_opaque_sub("directional_vector", annotation["directional_vector"])) + main_annotation.subs.append( + dt.make_opaque_sub("directional_vector", annotation["directional_vector"]) + ) if "measures" in annotation: - main_annotation.subs.append(dt.make_opaque_sub("measures", annotation["measures"])) + main_annotation.subs.append( + dt.make_opaque_sub("measures", annotation["measures"]) + ) if "auto_annotate" in annotation: - main_annotation.subs.append(dt.make_opaque_sub("auto_annotate", annotation["auto_annotate"])) + main_annotation.subs.append( + dt.make_opaque_sub("auto_annotate", annotation["auto_annotate"]) + ) if annotation.get("annotators") is not None: main_annotation.annotators = _parse_annotators(annotation["annotators"]) @@ -784,7 +916,9 @@ def _parse_darwin_raster_annotation(annotation: dict) -> Optional[dt.Annotation] slot_names: Optional[List[str]] = parse_slot_names(annotation) if not id or not name or not raster_layer: - raise ValueError("Raster annotation must have an 'id', 'name' and 'raster_layer' field") + raise ValueError( + "Raster annotation must have an 'id', 'name' and 'raster_layer' field" + ) dense_rle, mask_annotation_ids_mapping, total_pixels = ( raster_layer.get("dense_rle", None), @@ -835,9 +969,14 @@ def _parse_darwin_mask_annotation(annotation: dict) -> Optional[dt.Annotation]: def _parse_annotators(annotators: List[Dict[str, Any]]) -> List[dt.AnnotationAuthor]: if not (hasattr(annotators, "full_name") or not hasattr(annotators, "email")): - raise AttributeError("JSON file must contain annotators with 'full_name' and 'email' fields") + raise AttributeError( + "JSON file must contain annotators with 'full_name' and 'email' fields" + ) - return [dt.AnnotationAuthor(annotator["full_name"], annotator["email"]) for annotator in annotators] + return [ + dt.AnnotationAuthor(annotator["full_name"], annotator["email"]) + for annotator in annotators + ] def split_video_annotation(annotation: dt.AnnotationFile) -> List[dt.AnnotationFile]: @@ -870,9 +1009,13 @@ def split_video_annotation(annotation: dt.AnnotationFile) -> List[dt.AnnotationF frame_annotations = [] for i, frame_url in enumerate(annotation.frame_urls): annotations = [ - a.frames[i] for a in annotation.annotations if isinstance(a, dt.VideoAnnotation) and i in a.frames + a.frames[i] + for a in annotation.annotations + if isinstance(a, dt.VideoAnnotation) and i in a.frames ] - annotation_classes: Set[dt.AnnotationClass] = set([annotation.annotation_class for annotation in annotations]) + annotation_classes: Set[dt.AnnotationClass] = set( + [annotation.annotation_class for annotation in annotations] + ) filename: str = f"{Path(annotation.filename).stem}/{i:07d}.png" frame_annotations.append( dt.AnnotationFile( @@ -956,7 +1099,9 @@ def convert_polygons_to_sequences( else: list_polygons = cast(List[dt.Polygon], [polygons]) - if not isinstance(list_polygons[0], list) or not isinstance(list_polygons[0][0], dict): + if not isinstance(list_polygons[0], list) or not isinstance( + list_polygons[0][0], dict + ): raise ValueError("Unknown input format") sequences: List[List[Union[int, float]]] = [] @@ -964,8 +1109,8 @@ def convert_polygons_to_sequences( path: List[Union[int, float]] = [] for point in polygon: # Clip coordinates to the image size - x = max(min(point["x"], width -1) if width else point["x"], 0) - y = max(min(point["y"], height -1) if height else point["y"], 0) + x = max(min(point["x"], width - 1) if width else point["x"], 0) + y = max(min(point["y"], height - 1) if height else point["y"], 0) if rounding: path.append(round(x)) path.append(round(y)) @@ -983,7 +1128,9 @@ def convert_polygons_to_sequences( details="Do not use.", ) def convert_sequences_to_polygons( - sequences: List[Union[List[int], List[float]]], height: Optional[int] = None, width: Optional[int] = None + sequences: List[Union[List[int], List[float]]], + height: Optional[int] = None, + width: Optional[int] = None, ) -> Dict[str, List[dt.Polygon]]: """ Converts a list of polygons, encoded as a list of dictionaries of into a list of nd.arrays @@ -1095,7 +1242,9 @@ def convert_bounding_box_to_xyxy(box: dt.BoundingBox) -> List[float]: return [box["x"], box["y"], x2, y2] -def convert_polygons_to_mask(polygons: List, height: int, width: int, value: Optional[int] = 1) -> np.ndarray: +def convert_polygons_to_mask( + polygons: List, height: int, width: int, value: Optional[int] = 1 +) -> np.ndarray: """ Converts a list of polygons, encoded as a list of dictionaries into an ``nd.array`` mask. @@ -1139,7 +1288,7 @@ def chunk(items: List[Any], size: int) -> Iterator[Any]: A chunk of the of the given size. """ for i in range(0, len(items), size): - yield items[i:i + size] + yield items[i : i + size] def is_unix_like_os() -> bool: @@ -1189,31 +1338,58 @@ def _parse_version(data: dict) -> dt.AnnotationFileVersion: return dt.AnnotationFileVersion(int(major), int(minor), suffix) -def _data_to_annotations(data: Dict[str, Any]) -> List[Union[dt.Annotation, dt.VideoAnnotation]]: +def _data_to_annotations( + data: Dict[str, Any] +) -> List[Union[dt.Annotation, dt.VideoAnnotation]]: raw_image_annotations = filter( lambda annotation: ( - ("frames" not in annotation) and ("raster_layer" not in annotation) and ("mask" not in annotation) + ("frames" not in annotation) + and ("raster_layer" not in annotation) + and ("mask" not in annotation) ), data["annotations"], ) - raw_video_annotations = filter(lambda annotation: "frames" in annotation, data["annotations"]) - raw_raster_annotations = filter(lambda annotation: "raster_layer" in annotation, data["annotations"]) - raw_mask_annotations = filter(lambda annotation: "mask" in annotation, data["annotations"]) - image_annotations: List[dt.Annotation] = list(filter(None, map(_parse_darwin_annotation, raw_image_annotations))) + raw_video_annotations = filter( + lambda annotation: "frames" in annotation, data["annotations"] + ) + raw_raster_annotations = filter( + lambda annotation: "raster_layer" in annotation, data["annotations"] + ) + raw_mask_annotations = filter( + lambda annotation: "mask" in annotation, data["annotations"] + ) + image_annotations: List[dt.Annotation] = list( + filter(None, map(_parse_darwin_annotation, raw_image_annotations)) + ) video_annotations: List[dt.VideoAnnotation] = list( filter(None, map(_parse_darwin_video_annotation, raw_video_annotations)) ) raster_annotations: List[dt.Annotation] = list( filter(None, map(_parse_darwin_raster_annotation, raw_raster_annotations)) ) - mask_annotations: List[dt.Annotation] = list(filter(None, map(_parse_darwin_mask_annotation, raw_mask_annotations))) + mask_annotations: List[dt.Annotation] = list( + filter(None, map(_parse_darwin_mask_annotation, raw_mask_annotations)) + ) - return [*image_annotations, *video_annotations, *raster_annotations, *mask_annotations] + return [ + *image_annotations, + *video_annotations, + *raster_annotations, + *mask_annotations, + ] def _supported_schema_versions() -> Dict[Tuple[int, int, str], str]: - return {(2, 0, ""): "https://darwin-public.s3.eu-west-1.amazonaws.com/darwin_json/2.0/schema.json"} + return { + ( + 2, + 0, + "", + ): "https://darwin-public.s3.eu-west-1.amazonaws.com/darwin_json/2.0/schema.json" + } def _default_schema(version: dt.AnnotationFileVersion) -> Optional[str]: - return _supported_schema_versions().get((version.major, version.minor, version.suffix)) + return _supported_schema_versions().get( + (version.major, version.minor, version.suffix) + ) From 9c9ed4e38b64e167161f7f6580a4e04882f18abc Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Sat, 28 Oct 2023 16:32:44 +0100 Subject: [PATCH 2/6] Fixing tests (WIP) --- darwin/dataset/local_dataset.py | 14 +++++++++- darwin/dataset/utils.py | 10 ++++++- darwin/utils/utils.py | 49 ++++++++++++++++++++++++++------- 3 files changed, 61 insertions(+), 12 deletions(-) diff --git a/darwin/dataset/local_dataset.py b/darwin/dataset/local_dataset.py index 23aa6f87c..fd3b82440 100644 --- a/darwin/dataset/local_dataset.py +++ b/darwin/dataset/local_dataset.py @@ -8,6 +8,7 @@ from darwin.dataset.utils import get_classes, get_release_path, load_pil_image from darwin.utils import ( SUPPORTED_IMAGE_EXTENSIONS, + get_darwin_json_version, get_image_path_from_stream, parse_darwin_json, stream_darwin_json, @@ -131,12 +132,23 @@ def _setup_annotations_and_images( partition, split_type, ): + # Determine if the release is V1 or V2 JSON + json_version = get_darwin_json_version(annotations_dir) + + # + annotation_files = list(annotations_dir.glob("**/*.json")) + + for annotation_file in annotation_files: + with open(annotation_file, "r") as file: + data_str = file.read() + print(data_str) + # Find all the annotations and their corresponding images with_folders = any([item.is_dir() for item in images_dir.iterdir()]) for annotation_path in sorted(annotations_dir.glob("**/*.json")): darwin_json = stream_darwin_json(annotation_path) image_path = get_image_path_from_stream( - darwin_json, images_dir, with_folders + darwin_json, images_dir, with_folders, json_version ) if image_path.exists(): self.images_path.append(image_path) diff --git a/darwin/dataset/utils.py b/darwin/dataset/utils.py index 5d295e3e3..2a74399e6 100644 --- a/darwin/dataset/utils.py +++ b/darwin/dataset/utils.py @@ -17,6 +17,7 @@ SUPPORTED_EXTENSIONS, SUPPORTED_VIDEO_EXTENSIONS, attempt_decode, + get_darwin_json_version, get_image_path_from_stream, is_unix_like_os, parse_darwin_json, @@ -568,13 +569,20 @@ def _map_annotations_to_images( Raises: ValueError: If there are inconsistencies with the annotations and images. """ + images_paths = [] annotations_paths = [] invalid_annotation_paths = [] + + # Determine if the release is V1 or V2 JSON + json_version = get_darwin_json_version(annotations_dir) + with_folders = any([item.is_dir() for item in images_dir.iterdir()]) for annotation_path in annotations_dir.glob("**/*.json"): darwin_json = stream_darwin_json(annotation_path) - image_path = get_image_path_from_stream(darwin_json, images_dir, with_folders) + image_path = get_image_path_from_stream( + darwin_json, images_dir, with_folders, json_version + ) if image_path.exists(): images_paths.append(image_path) annotations_paths.append(annotation_path) diff --git a/darwin/utils/utils.py b/darwin/utils/utils.py index 4934b1c9b..6cd83997b 100644 --- a/darwin/utils/utils.py +++ b/darwin/utils/utils.py @@ -495,10 +495,14 @@ def stream_darwin_json(path: Path) -> PersistentStreamingJSONObject: def get_image_path_from_stream( - darwin_json: PersistentStreamingJSONObject, images_dir: Path, with_folders: bool + darwin_json: PersistentStreamingJSONObject, + images_dir: Path, + with_folders: bool, + json_version: str, ) -> Path: """ - Returns the path to the image file associated with the given darwin json file (V1 or V2). + Returns the path to the image file associated with the given darwin json file. + Compatible with V1 & V2 Darwin JSON, as well as releases in folders and flat structures. Parameters ---------- @@ -506,27 +510,32 @@ def get_image_path_from_stream( A stream of the JSON file. images_dir : Path Path to the directory containing the images. - with_folders: Bool + with_folders: bool Flag to determine if the release was pulled with or without folders. + json_version: str + String representing the version of the Darwin JSON Returns ------- Path Path to the image file. """ - if not with_folders: - try: + if json_version == "2.0": + if not with_folders: return images_dir / Path(darwin_json["item"]["name"]) - except KeyError: - return images_dir / Path(darwin_json["image"]["filename"]) - else: - try: + else: return ( images_dir / (Path(darwin_json["item"]["path"].lstrip("/\\"))) / Path(darwin_json["item"]["name"]) ) - except KeyError: + else: + if not with_folders: + try: + return images_dir / Path(darwin_json["image"]["filename"]) + except Exception: + pass + else: return ( images_dir / (Path(darwin_json["image"]["path"].lstrip("/\\"))) @@ -534,6 +543,26 @@ def get_image_path_from_stream( ) +def get_darwin_json_version(annotations_dir: Path) -> str: + """ + Returns true is the input Darwin JSON file is 2.0, and False if 1.0. + + Parameters + ---------- + annotations_dir : Path + Path to the directory containing the annotation files. + + Returns + ------- + str + A str representing the Darwin JSON version. + """ + with open(next(annotations_dir.glob("*.json")), "r") as file: + data_str = file.read() + data = json.loads(data_str) + return "2.0" if "version" in data and data["version"] == "2.0" else "1.0" + + def _parse_darwin_v2(path: Path, data: Dict[str, Any]) -> dt.AnnotationFile: item = data["item"] item_source = item.get("source_info", {}) From 5a5d231e1066938f15b9031c67f35a8ac4536168 Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Tue, 21 Nov 2023 09:50:37 +0000 Subject: [PATCH 3/6] WIP --- darwin/utils/utils.py | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/darwin/utils/utils.py b/darwin/utils/utils.py index 9dad81636..70df00bd8 100644 --- a/darwin/utils/utils.py +++ b/darwin/utils/utils.py @@ -540,6 +540,8 @@ def get_image_path_from_stream( / Path(darwin_json["image"]["filename"]) ) + # WIP: Implementing this with regex instead of streaming + def get_darwin_json_version(annotations_dir: Path) -> str: """ @@ -570,15 +572,9 @@ def _parse_darwin_v2(path: Path, data: Dict[str, Any]) -> dt.AnnotationFile: annotations: List[Union[dt.Annotation, dt.VideoAnnotation]] = _data_to_annotations( data ) -<<<<<<< HEAD annotation_classes: Set[dt.AnnotationClass] = set( [annotation.annotation_class for annotation in annotations] ) -======= - annotation_classes: Set[dt.AnnotationClass] = { - annotation.annotation_class for annotation in annotations - } ->>>>>>> master if len(slots) == 0: annotation_file = dt.AnnotationFile( @@ -654,15 +650,9 @@ def _parse_darwin_image( annotations: List[Union[dt.Annotation, dt.VideoAnnotation]] = _data_to_annotations( data ) -<<<<<<< HEAD annotation_classes: Set[dt.AnnotationClass] = set( [annotation.annotation_class for annotation in annotations] ) -======= - annotation_classes: Set[dt.AnnotationClass] = { - annotation.annotation_class for annotation in annotations - } ->>>>>>> master slot = dt.Slot( name=None, @@ -705,15 +695,9 @@ def _parse_darwin_video( annotations: List[Union[dt.Annotation, dt.VideoAnnotation]] = _data_to_annotations( data ) -<<<<<<< HEAD annotation_classes: Set[dt.AnnotationClass] = set( [annotation.annotation_class for annotation in annotations] ) -======= - annotation_classes: Set[dt.AnnotationClass] = { - annotation.annotation_class for annotation in annotations - } ->>>>>>> master if "width" not in data["image"] or "height" not in data["image"]: raise OutdatedDarwinJSONFormat( @@ -1058,15 +1042,9 @@ def split_video_annotation(annotation: dt.AnnotationFile) -> List[dt.AnnotationF for a in annotation.annotations if isinstance(a, dt.VideoAnnotation) and i in a.frames ] -<<<<<<< HEAD annotation_classes: Set[dt.AnnotationClass] = set( [annotation.annotation_class for annotation in annotations] ) -======= - annotation_classes: Set[dt.AnnotationClass] = { - annotation.annotation_class for annotation in annotations - } ->>>>>>> master filename: str = f"{Path(annotation.filename).stem}/{i:07d}.png" frame_annotations.append( dt.AnnotationFile( From 69a935e3cada8554e552d2fd245029bd8dbbb312 Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Tue, 21 Nov 2023 11:09:01 +0000 Subject: [PATCH 4/6] Except JSON Streaming error & load JSON as normal --- darwin/dataset/local_dataset.py | 2 +- darwin/dataset/utils.py | 2 +- darwin/utils/utils.py | 44 ++++++++++++++++++--------------- 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/darwin/dataset/local_dataset.py b/darwin/dataset/local_dataset.py index fd3b82440..5610caeb4 100644 --- a/darwin/dataset/local_dataset.py +++ b/darwin/dataset/local_dataset.py @@ -148,7 +148,7 @@ def _setup_annotations_and_images( for annotation_path in sorted(annotations_dir.glob("**/*.json")): darwin_json = stream_darwin_json(annotation_path) image_path = get_image_path_from_stream( - darwin_json, images_dir, with_folders, json_version + darwin_json, images_dir, with_folders, json_version, annotation_path ) if image_path.exists(): self.images_path.append(image_path) diff --git a/darwin/dataset/utils.py b/darwin/dataset/utils.py index 2a74399e6..e4d43c180 100644 --- a/darwin/dataset/utils.py +++ b/darwin/dataset/utils.py @@ -581,7 +581,7 @@ def _map_annotations_to_images( for annotation_path in annotations_dir.glob("**/*.json"): darwin_json = stream_darwin_json(annotation_path) image_path = get_image_path_from_stream( - darwin_json, images_dir, with_folders, json_version + darwin_json, images_dir, with_folders, json_version, annotation_path ) if image_path.exists(): images_paths.append(image_path) diff --git a/darwin/utils/utils.py b/darwin/utils/utils.py index 70df00bd8..214ebdb5d 100644 --- a/darwin/utils/utils.py +++ b/darwin/utils/utils.py @@ -497,6 +497,7 @@ def get_image_path_from_stream( images_dir: Path, with_folders: bool, json_version: str, + annotation_path: Path, ) -> Path: """ Returns the path to the image file associated with the given darwin json file. @@ -518,29 +519,32 @@ def get_image_path_from_stream( Path Path to the image file. """ - if json_version == "2.0": - if not with_folders: - return images_dir / Path(darwin_json["item"]["name"]) + try: + if json_version == "2.0": + if not with_folders: + return images_dir / Path(darwin_json["item"]["name"]) + else: + return ( + images_dir + / (Path(darwin_json["item"]["path"].lstrip("/\\"))) + / Path(darwin_json["item"]["name"]) + ) else: - return ( - images_dir - / (Path(darwin_json["item"]["path"].lstrip("/\\"))) - / Path(darwin_json["item"]["name"]) - ) - else: - if not with_folders: - try: + if not with_folders: return images_dir / Path(darwin_json["image"]["filename"]) - except Exception: - pass + else: + return ( + images_dir + / (Path(darwin_json["image"]["path"].lstrip("/\\"))) + / Path(darwin_json["image"]["filename"]) + ) + except OSError as e: + # Load in the JSON as normal + darwin_json = parse_darwin_json(path=annotation_path) + if not with_folders: + return images_dir / Path(darwin_json.filename) else: - return ( - images_dir - / (Path(darwin_json["image"]["path"].lstrip("/\\"))) - / Path(darwin_json["image"]["filename"]) - ) - - # WIP: Implementing this with regex instead of streaming + return images_dir / Path(darwin_json.full_path.lstrip("/\\")) def get_darwin_json_version(annotations_dir: Path) -> str: From 84a5b050356d610b99118c81f0dbe7430185c407 Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Tue, 21 Nov 2023 11:12:03 +0000 Subject: [PATCH 5/6] Fixed small typing bug in old darwin move_to_stage method --- darwin/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/darwin/client.py b/darwin/client.py index b87bf371d..064c10180 100644 --- a/darwin/client.py +++ b/darwin/client.py @@ -908,7 +908,7 @@ def move_to_stage( dataset_slug: str, team_slug: str, filters: Dict[str, UnknownType], - stage_id: int, + stage_id: str, ) -> None: """ Moves the given items to the specified stage From 76db70fbab8f878528e90d9ca71e3c44deaef04b Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Mon, 11 Dec 2023 15:03:47 +0000 Subject: [PATCH 6/6] Undid change included in PY-641 --- darwin/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/darwin/client.py b/darwin/client.py index 064c10180..b87bf371d 100644 --- a/darwin/client.py +++ b/darwin/client.py @@ -908,7 +908,7 @@ def move_to_stage( dataset_slug: str, team_slug: str, filters: Dict[str, UnknownType], - stage_id: str, + stage_id: int, ) -> None: """ Moves the given items to the specified stage