diff --git a/darwin/dataset/local_dataset.py b/darwin/dataset/local_dataset.py index 4e88cf0cc..b8c48a7ea 100644 --- a/darwin/dataset/local_dataset.py +++ b/darwin/dataset/local_dataset.py @@ -8,6 +8,7 @@ from darwin.dataset.utils import get_classes, get_release_path, load_pil_image from darwin.utils import ( SUPPORTED_IMAGE_EXTENSIONS, + get_darwin_json_version, get_image_path_from_stream, is_stream_list_empty, parse_darwin_json, @@ -133,18 +134,29 @@ def _setup_annotations_and_images( keep_empty_annotations: bool = False, ): # Find all the annotations and their corresponding images - for annotation_path in sorted(annotations_dir.glob("**/*.json")): - darwin_json = stream_darwin_json(annotation_path) + json_version = get_darwin_json_version(annotations_dir) - image_path = get_image_path_from_stream(darwin_json, images_dir) + with_folders = any([item.is_dir() for item in images_dir.iterdir()]) + annotation_filepaths = get_annotation_filepaths( + release_path, annotations_dir, annotation_type, split, partition, split_type + ) + + for annotation_filepath in annotation_filepaths: + annotation_filepath = Path(annotation_filepath) + darwin_json = stream_darwin_json(annotation_filepath) + image_path = get_image_path_from_stream( + darwin_json, images_dir, with_folders, json_version, annotation_filepath + ) if image_path.exists(): if not keep_empty_annotations and is_stream_list_empty(darwin_json["annotations"]): continue self.images_path.append(image_path) - self.annotations_path.append(annotation_path) + self.annotations_path.append(annotation_filepath) continue else: - raise ValueError(f"Annotation ({annotation_path}) does not have a corresponding image {image_path}") + raise ValueError( + f"Annotation ({annotation_filepath}) does not have a corresponding image" + ) def _initial_setup(self, dataset_path, release_name): assert dataset_path is not None @@ -419,7 +431,7 @@ def __str__(self): ) -def build_stems( +def get_annotation_filepaths( release_path: Path, annotations_dir: Path, annotation_type: str, @@ -428,7 +440,7 @@ def build_stems( split_type: str = "random", ) -> Iterator[str]: """ - Builds the stems for the given release with the given annotations as base. + Returns a list of annotation filepaths for the given release & partition. Parameters ---------- @@ -469,7 +481,7 @@ def build_stems( """ if partition is None: - return (str(e.relative_to(annotations_dir).parent / e.stem) for e in sorted(annotations_dir.glob("**/*.json"))) + return (str(e) for e in sorted(annotations_dir.glob("**/*.json"))) if split_type == "random": split_filename = f"{split_type}_{partition}.txt" @@ -480,7 +492,7 @@ def build_stems( split_path = release_path / "lists" / split / split_filename if split_path.is_file(): - return (e.strip("\n\r") for e in split_path.open()) + return (line.strip("\n\r") for line in split_path.open()) raise FileNotFoundError( "could not find a dataset partition. " diff --git a/darwin/dataset/split_manager.py b/darwin/dataset/split_manager.py index df7416c40..47f49e81c 100644 --- a/darwin/dataset/split_manager.py +++ b/darwin/dataset/split_manager.py @@ -426,14 +426,8 @@ def _write_to_file( ) -> None: with open(str(file_path), "w") as f: for i in split_idx: - # To deal with recursive search, we want to write the difference between the annotation path - # and its parent, without the file extension - stem = ( - str(annotation_files[i]) - .replace(f"{annotation_path}/", "") - .rsplit(".json", 1)[0] - ) - f.write(f"{stem}\n") + annotation_filepath = annotation_files[i] + f.write(f"{annotation_filepath}\n") def _validate_split(val_percentage: float, test_percentage: float) -> None: diff --git a/darwin/utils/utils.py b/darwin/utils/utils.py index 590a4b69e..02ce6d3e8 100644 --- a/darwin/utils/utils.py +++ b/darwin/utils/utils.py @@ -482,20 +482,18 @@ def stream_darwin_json(path: Path) -> PersistentStreamingJSONObject: with path.open() as infile: return json_stream.load(infile, persistent=True) - - -def is_stream_list_empty(json_list: PersistentStreamingJSONList) -> bool: - try: - json_list[0] - except IndexError: - return True - - return False - - -def get_image_path_from_stream(darwin_json: PersistentStreamingJSONObject, images_dir: Path) -> Path: + + +def get_image_path_from_stream( + darwin_json: PersistentStreamingJSONObject, + images_dir: Path, + with_folders: bool, + json_version: str, + annotation_filepath: Path, +) -> Path: """ - Returns the path to the image file associated with the given darwin json file (V1 or V2). + Returns the path to the image file associated with the given darwin json file. + Compatible with V1 & V2 Darwin JSON, as well as releases in folders and flat structures. Parameters ---------- @@ -503,6 +501,10 @@ def get_image_path_from_stream(darwin_json: PersistentStreamingJSONObject, image A stream of the JSON file. images_dir : Path Path to the directory containing the images. + with_folders: bool + Flag to determine if the release was pulled with or without folders. + json_version: str + String representing the version of the Darwin JSON Returns ------- @@ -510,9 +512,60 @@ def get_image_path_from_stream(darwin_json: PersistentStreamingJSONObject, image Path to the image file. """ try: - return images_dir / (Path(darwin_json["item"]["path"].lstrip("/\\"))) / Path(darwin_json["item"]["name"]) - except KeyError: - return images_dir / (Path(darwin_json["image"]["path"].lstrip("/\\"))) / Path(darwin_json["image"]["filename"]) + if json_version == "2.0": + if not with_folders: + return images_dir / Path(darwin_json["item"]["name"]) + else: + return ( + images_dir + / (Path(darwin_json["item"]["path"].lstrip("/\\"))) + / Path(darwin_json["item"]["name"]) + ) + else: + if not with_folders: + return images_dir / Path(darwin_json["image"]["filename"]) + else: + return ( + images_dir + / (Path(darwin_json["image"]["path"].lstrip("/\\"))) + / Path(darwin_json["image"]["filename"]) + ) + except OSError as e: + # Load in the JSON as normal + darwin_json = parse_darwin_json(path=annotation_filepath) + if not with_folders: + return images_dir / Path(darwin_json.filename) + else: + return images_dir / Path(darwin_json.full_path.lstrip("/\\")) + + +def get_darwin_json_version(annotations_dir: Path) -> str: + """ + Returns true is the input Darwin JSON file is 2.0, and False if 1.0. + + Parameters + ---------- + annotations_dir : Path + Path to the directory containing the annotation files. + + Returns + ------- + str + A str representing the Darwin JSON version. + """ + with open(next(annotations_dir.glob("*.json")), "r") as file: + data_str = file.read() + data = json.loads(data_str) + return "2.0" if "version" in data and data["version"] == "2.0" else "1.0" + + +def is_stream_list_empty(json_list: PersistentStreamingJSONList) -> bool: + try: + json_list[0] + except IndexError: + return True + + return False def _parse_darwin_v2(path: Path, data: Dict[str, Any]) -> dt.AnnotationFile: diff --git a/tests/darwin/dataset/local_dataset_test.py b/tests/darwin/dataset/local_dataset_test.py index 6e0039462..762ad239e 100644 --- a/tests/darwin/dataset/local_dataset_test.py +++ b/tests/darwin/dataset/local_dataset_test.py @@ -3,34 +3,17 @@ import pytest -from darwin.dataset.local_dataset import build_stems +from darwin.dataset.local_dataset import get_annotation_filepaths from tests.fixtures import * @pytest.mark.usefixtures("file_read_write_test") -class TestBuildStems: - def test_look_into_annotations_directory_if_no_partition_specified( - self, team_dataset_release_path: Path, annotations_path: Path, split_path: Path - ): - (annotations_path / "1.json").mkdir() - (annotations_path / "2" / "2.json").mkdir(parents=True) - (annotations_path / "test" / "3" / "3.json").mkdir(parents=True) - - stems = list( - build_stems( - team_dataset_release_path, annotations_path, "tag", split_path.name - ) - ) - - assert "1" in stems - assert "2/2" in stems or "2\\2" in stems - assert "test/3/3" in stems or "test\\3\\3" in stems - +class TestGetAnnotationFilepaths: def test_raise_value_error_if_split_type_is_unknown( self, team_dataset_release_path: Path, annotations_path: Path, split_path: Path ): with pytest.raises(ValueError) as e: - build_stems( + get_annotation_filepaths( team_dataset_release_path, annotations_path, "tag", @@ -41,7 +24,7 @@ def test_raise_value_error_if_split_type_is_unknown( assert str(e.value) == 'Unknown split type "unknown"' - def test_stems_ending_with_spaces( + def test_annotation_filepaths_ending_with_spaces( self, team_dataset_release_path: Path, annotations_path: Path, split_path: Path ): resource_file = ( @@ -49,8 +32,8 @@ def test_stems_ending_with_spaces( ) copyfile(resource_file, split_path / "random_train.txt") - stems = list( - build_stems( + annotation_filepaths = list( + get_annotation_filepaths( team_dataset_release_path, annotations_path, "tag", @@ -59,15 +42,15 @@ def test_stems_ending_with_spaces( ) ) - assert "one" in stems - assert "two " in stems - assert "three " in stems + assert "path/to/annotation/file/one.json " in annotation_filepaths + assert "path/to/annotation/file/two.json " in annotation_filepaths + assert "three.json " in annotation_filepaths def test_raise_file_not_found_if_split_file_does_not_exists( self, team_dataset_release_path: Path, annotations_path: Path, split_path: Path ): with pytest.raises(FileNotFoundError) as e: - build_stems( + get_annotation_filepaths( team_dataset_release_path, annotations_path, "tag", @@ -79,3 +62,24 @@ def test_raise_file_not_found_if_split_file_does_not_exists( str(e.value) == "could not find a dataset partition. Split the dataset using `split_dataset()` from `darwin.dataset.split_manager`" ) + + def test_get_annotation_filepaths_no_partition( + self, team_dataset_release_path: Path, annotations_path: Path, split_path: Path + ): + (annotations_path / "1.json").mkdir() + (annotations_path / "2" / "2.json").mkdir(parents=True) + (annotations_path / "test" / "3" / "3.json").mkdir(parents=True) + + annotation_filepaths = list( + get_annotation_filepaths( + team_dataset_release_path, + annotations_path, + "tag", + split_path.name, + None, + ) + ) + + assert str(annotations_path / "1.json") in annotation_filepaths + assert str(annotations_path / "2/2.json") in annotation_filepaths + assert str(annotations_path / "test/3/3.json") in annotation_filepaths diff --git a/tests/darwin/dataset/resources/random_train b/tests/darwin/dataset/resources/random_train index 441937a5e..1184c018c 100644 --- a/tests/darwin/dataset/resources/random_train +++ b/tests/darwin/dataset/resources/random_train @@ -1,3 +1,3 @@ -one -two -three +path/to/annotation/file/one.json +path/to/annotation/file/two.json +three.json \ No newline at end of file diff --git a/tests/data.zip b/tests/data.zip index 3fd227be3..93a2c3ccf 100644 Binary files a/tests/data.zip and b/tests/data.zip differ