Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[IO-1445][external] Changes to LocalDataset() & get_annotations() to account for local releases pulled with folders #678

Merged
merged 10 commits into from
Oct 24, 2023
Merged
37 changes: 15 additions & 22 deletions darwin/dataset/local_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,12 @@
from PIL import Image as PILImage

from darwin.dataset.utils import get_classes, get_release_path, load_pil_image
from darwin.utils import SUPPORTED_IMAGE_EXTENSIONS, parse_darwin_json
from darwin.utils import (
SUPPORTED_IMAGE_EXTENSIONS,
get_image_path_from_stream,
parse_darwin_json,
stream_darwin_json,
)


class LocalDataset:
Expand Down Expand Up @@ -126,30 +131,18 @@ def _setup_annotations_and_images(
partition,
split_type,
):
stems = build_stems(
release_path, annotations_dir, annotation_type, split, partition, split_type
)
for stem in stems:
annotation_path = annotations_dir / f"{stem}.json"
images = []
for ext in SUPPORTED_IMAGE_EXTENSIONS:
image_path = images_dir / f"{stem}{ext}"
if image_path.exists():
images.append(image_path)
continue
image_path = images_dir / f"{stem}{ext.upper()}"
if image_path.exists():
images.append(image_path)
if len(images) < 1:
# Find all the annotations and their corresponding images
for annotation_path in sorted(annotations_dir.glob("**/*.json")):
darwin_json = stream_darwin_json(annotation_path)
image_path = get_image_path_from_stream(darwin_json, images_dir)
if image_path.exists():
self.images_path.append(image_path)
self.annotations_path.append(annotation_path)
continue
else:
raise ValueError(
f"Annotation ({annotation_path}) does not have a corresponding image"
)
if len(images) > 1:
raise ValueError(
f"Image ({stem}) is present with multiple extensions. This is forbidden."
)
self.images_path.append(images[0])
self.annotations_path.append(annotation_path)

def _initial_setup(self, dataset_path, release_name):
assert dataset_path is not None
Expand Down
40 changes: 14 additions & 26 deletions darwin/dataset/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@
SUPPORTED_EXTENSIONS,
SUPPORTED_VIDEO_EXTENSIONS,
attempt_decode,
get_image_path_from_stream,
is_unix_like_os,
parse_darwin_json,
)
from darwin.utils.utils import stream_darwin_json

# E.g.: {"partition" => {"class_name" => 123}}
AnnotationDistribution = Dict[str, Counter]
Expand Down Expand Up @@ -569,33 +571,19 @@ def _map_annotations_to_images(
images_paths = []
annotations_paths = []
invalid_annotation_paths = []
for stem in stems:
annotation_path = annotations_dir / f"{stem}.json"
images = []
for ext in SUPPORTED_EXTENSIONS:
image_path = images_dir / f"{stem}{ext}"
if image_path.exists():
images.append(image_path)
continue
image_path = images_dir / f"{stem}{ext.upper()}"
if image_path.exists():
images.append(image_path)

image_count = len(images)
if image_count != 1 and ignore_inconsistent_examples:
invalid_annotation_paths.append(annotation_path)
for annotation_path in annotations_dir.glob("**/*.json"):
darwin_json = stream_darwin_json(annotation_path)
image_path = get_image_path_from_stream(darwin_json, images_dir)
if image_path.exists():
images_paths.append(image_path)
annotations_paths.append(annotation_path)
continue
elif image_count < 1:
raise ValueError(
f"Annotation ({annotation_path}) does not have a corresponding image"
)
elif image_count > 1:
raise ValueError(
f"Image ({stem}) is present with multiple extensions. This is forbidden."
)

images_paths.append(images[0])
annotations_paths.append(annotation_path)
else:
if ignore_inconsistent_examples:
invalid_annotation_paths.append(annotation_path)
continue
else:
raise ValueError(f"Annotation ({annotation_path}) does not have a corresponding image")

return images_paths, annotations_paths, invalid_annotation_paths

Expand Down
41 changes: 41 additions & 0 deletions darwin/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@
)

import deprecation
import json_stream
import numpy as np
import orjson as json
import requests
from json_stream.base import PersistentStreamingJSONObject
from jsonschema import exceptions, validators
from requests import Response, request
from rich.progress import ProgressType, track
Expand Down Expand Up @@ -454,6 +456,45 @@ def parse_darwin_json(path: Path, count: Optional[int] = None) -> Optional[dt.An
else:
return _parse_darwin_image(path, data, count)

def stream_darwin_json(path: Path) -> PersistentStreamingJSONObject:
    """
    Opens a Darwin JSON file and hands it to ``json_stream`` in persistent mode, so the
    document can be consumed as a stream instead of being parsed in one go.

    Parameters
    ----------
    path : Path
        Location of the Darwin JSON file to open.

    Returns
    -------
    PersistentStreamingJSONObject
        The object produced by ``json_stream.load(..., persistent=True)`` for that file.
    """
    # NOTE(review): the file handle is closed as soon as the ``with`` block exits, i.e. before
    # the caller reads anything from the returned stream. Presumably later lookups only
    # succeed for data json_stream has already buffered -- confirm against the json_stream
    # documentation before relying on this for keys located deep inside large files.
    with path.open() as json_file:
        darwin_stream = json_stream.load(json_file, persistent=True)
    return darwin_stream

def get_image_path_from_stream(
    darwin_json: PersistentStreamingJSONObject, images_dir: Path
) -> Path:
    """
    Builds the location of the image that a Darwin JSON document (V1 or V2) refers to.

    The V2 keys (``item.path`` / ``item.name``) are tried first. If any of them is missing,
    the V1 keys (``image.path`` / ``image.filename``) are used instead.

    Parameters
    ----------
    darwin_json : PersistentStreamingJSONObject
        The streamed Darwin JSON document.
    images_dir : Path
        Directory under which the images are stored.

    Returns
    -------
    Path
        ``images_dir`` joined with the document's folder path and its file name.

    Raises
    ------
    KeyError
        If neither the V2 nor the V1 keys are present in the document.
    """
    try:
        v2_item = darwin_json["item"]
        # Leading slashes/backslashes are dropped so the folder is joined relative to images_dir.
        folder = Path(v2_item["path"].lstrip("/\\"))
        file_name = Path(v2_item["name"])
    except KeyError:
        v1_image = darwin_json["image"]
        folder = Path(v1_image["path"].lstrip("/\\"))
        file_name = Path(v1_image["filename"])
    return images_dir / folder / file_name

def _parse_darwin_v2(path: Path, data: Dict[str, Any]) -> dt.AnnotationFile:
item = data["item"]
Expand Down
Loading