diff --git a/darwin/dataset/local_dataset.py b/darwin/dataset/local_dataset.py index d79ecae76..5610caeb4 100644 --- a/darwin/dataset/local_dataset.py +++ b/darwin/dataset/local_dataset.py @@ -8,6 +8,7 @@ from darwin.dataset.utils import get_classes, get_release_path, load_pil_image from darwin.utils import ( SUPPORTED_IMAGE_EXTENSIONS, + get_darwin_json_version, get_image_path_from_stream, parse_darwin_json, stream_darwin_json, @@ -131,10 +132,24 @@ def _setup_annotations_and_images( partition, split_type, ): + # Determine if the release is V1 or V2 JSON + json_version = get_darwin_json_version(annotations_dir) + + # NOTE(review): the loop below reads and prints every annotation file; looks like leftover debugging, confirm before merging + annotation_files = list(annotations_dir.glob("**/*.json")) + + for annotation_file in annotation_files: + with open(annotation_file, "r") as file: + data_str = file.read() + print(data_str) + # Find all the annotations and their corresponding images + with_folders = any([item.is_dir() for item in images_dir.iterdir()]) for annotation_path in sorted(annotations_dir.glob("**/*.json")): darwin_json = stream_darwin_json(annotation_path) - image_path = get_image_path_from_stream(darwin_json, images_dir) + image_path = get_image_path_from_stream( + darwin_json, images_dir, with_folders, json_version, annotation_path + ) if image_path.exists(): self.images_path.append(image_path) self.annotations_path.append(annotation_path) diff --git a/darwin/dataset/utils.py b/darwin/dataset/utils.py index 7aa3fa1bc..e4d43c180 100644 --- a/darwin/dataset/utils.py +++ b/darwin/dataset/utils.py @@ -17,6 +17,7 @@ SUPPORTED_EXTENSIONS, SUPPORTED_VIDEO_EXTENSIONS, attempt_decode, + get_darwin_json_version, get_image_path_from_stream, is_unix_like_os, parse_darwin_json, @@ -568,12 +569,20 @@ def _map_annotations_to_images( Raises: ValueError: If there are inconsistencies with the annotations and images. 
""" + images_paths = [] annotations_paths = [] invalid_annotation_paths = [] + + # Determine if the release is V1 or V2 JSON + json_version = get_darwin_json_version(annotations_dir) + + with_folders = any([item.is_dir() for item in images_dir.iterdir()]) for annotation_path in annotations_dir.glob("**/*.json"): darwin_json = stream_darwin_json(annotation_path) - image_path = get_image_path_from_stream(darwin_json, images_dir) + image_path = get_image_path_from_stream( + darwin_json, images_dir, with_folders, json_version, annotation_path + ) if image_path.exists(): images_paths.append(image_path) annotations_paths.append(annotation_path) diff --git a/darwin/utils/utils.py b/darwin/utils/utils.py index fe464aa62..c4e34b9ab 100644 --- a/darwin/utils/utils.py +++ b/darwin/utils/utils.py @@ -493,10 +493,15 @@ def stream_darwin_json(path: Path) -> PersistentStreamingJSONObject: def get_image_path_from_stream( - darwin_json: PersistentStreamingJSONObject, images_dir: Path + darwin_json: PersistentStreamingJSONObject, + images_dir: Path, + with_folders: bool, + json_version: str, + annotation_path: Path, ) -> Path: """ - Returns the path to the image file associated with the given darwin json file (V1 or V2). + Returns the path to the image file associated with the given darwin json file. + Compatible with V1 & V2 Darwin JSON, as well as releases in folders and flat structures. Parameters ---------- @@ -504,6 +509,10 @@ def get_image_path_from_stream( A stream of the JSON file. images_dir : Path Path to the directory containing the images. + with_folders: bool + Flag to determine if the release was pulled with or without folders. + json_version: str + String representing the version of the Darwin JSON ("2.0" selects the V2 layout; any other value is treated as V1). Returns ------- @@ -511,17 +520,51 @@ def get_image_path_from_stream( Path to the image file. 
""" try: - return ( - images_dir - / (Path(darwin_json["item"]["path"].lstrip("/\\"))) - / Path(darwin_json["item"]["name"]) - ) - except KeyError: - return ( - images_dir - / (Path(darwin_json["image"]["path"].lstrip("/\\"))) - / Path(darwin_json["image"]["filename"]) - ) + if json_version == "2.0": + if not with_folders: + return images_dir / Path(darwin_json["item"]["name"]) + else: + return ( + images_dir + / (Path(darwin_json["item"]["path"].lstrip("/\\"))) + / Path(darwin_json["item"]["name"]) + ) + else: + if not with_folders: + return images_dir / Path(darwin_json["image"]["filename"]) + else: + return ( + images_dir + / (Path(darwin_json["image"]["path"].lstrip("/\\"))) + / Path(darwin_json["image"]["filename"]) + ) + except OSError as e: + # Streaming access raised OSError, so load the JSON with parse_darwin_json instead + darwin_json = parse_darwin_json(path=annotation_path) + if not with_folders: + return images_dir / Path(darwin_json.filename) + else: + return images_dir / Path(darwin_json.full_path.lstrip("/\\")) + + +def get_darwin_json_version(annotations_dir: Path) -> str: + """ + Returns "2.0" if the first JSON file found directly in annotations_dir declares version "2.0", otherwise "1.0". + + Parameters + ---------- + annotations_dir : Path + Path to the directory containing the annotation files. + + Returns + ------- + str + A str representing the Darwin JSON version. 
+ """ + with open(next(annotations_dir.glob("*.json")), "r") as file: + data_str = file.read() + data = json.loads(data_str) + return "2.0" if "version" in data and data["version"] == "2.0" else "1.0" def _parse_darwin_v2(path: Path, data: Dict[str, Any]) -> dt.AnnotationFile: @@ -533,9 +576,9 @@ def _parse_darwin_v2(path: Path, data: Dict[str, Any]) -> dt.AnnotationFile: annotations: List[Union[dt.Annotation, dt.VideoAnnotation]] = _data_to_annotations( data ) - annotation_classes: Set[dt.AnnotationClass] = { - annotation.annotation_class for annotation in annotations - } + annotation_classes: Set[dt.AnnotationClass] = set( + [annotation.annotation_class for annotation in annotations] + ) if len(slots) == 0: annotation_file = dt.AnnotationFile( @@ -612,9 +655,9 @@ def _parse_darwin_image( annotations: List[Union[dt.Annotation, dt.VideoAnnotation]] = _data_to_annotations( data ) - annotation_classes: Set[dt.AnnotationClass] = { - annotation.annotation_class for annotation in annotations - } + annotation_classes: Set[dt.AnnotationClass] = set( + [annotation.annotation_class for annotation in annotations] + ) slot = dt.Slot( name=None, @@ -657,9 +700,9 @@ def _parse_darwin_video( annotations: List[Union[dt.Annotation, dt.VideoAnnotation]] = _data_to_annotations( data ) - annotation_classes: Set[dt.AnnotationClass] = { - annotation.annotation_class for annotation in annotations - } + annotation_classes: Set[dt.AnnotationClass] = set( + [annotation.annotation_class for annotation in annotations] + ) if "width" not in data["image"] or "height" not in data["image"]: raise OutdatedDarwinJSONFormat( @@ -1007,9 +1050,9 @@ def split_video_annotation(annotation: dt.AnnotationFile) -> List[dt.AnnotationF for a in annotation.annotations if isinstance(a, dt.VideoAnnotation) and i in a.frames ] - annotation_classes: Set[dt.AnnotationClass] = { - annotation.annotation_class for annotation in annotations - } + annotation_classes: Set[dt.AnnotationClass] = set( + 
[annotation.annotation_class for annotation in annotations] + ) filename: str = f"{Path(annotation.filename).stem}/{i:07d}.png" frame_annotations.append( dt.AnnotationFile(