Skip to content

Commit

Permalink
[IO-1445][external] Changes to LocalDataset() & get_annotations() to …
Browse files Browse the repository at this point in the history
…account for local releases pulled with folders (#678)

* Changes to LocalDataset() & get_annotations() to account for releases pulled with folders

* Preserve order of annotation files in LocalDataset constructor

* Fixed GH filtering & restored accidentally changed/deleted files

* Change to ensure tests can run without formatting & linting

* Added support for JSON streaming to improve speed when parsing Darwin JSON in some scenarios

* Updated changes to be compatible with AI-1260

---------

Co-authored-by: John Wilkie <[email protected]>
  • Loading branch information
JBWilkie and JBWilkie authored Oct 24, 2023
1 parent fae7714 commit 23d4d16
Show file tree
Hide file tree
Showing 5 changed files with 169 additions and 54 deletions.
37 changes: 15 additions & 22 deletions darwin/dataset/local_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,12 @@
from PIL import Image as PILImage

from darwin.dataset.utils import get_classes, get_release_path, load_pil_image
from darwin.utils import SUPPORTED_IMAGE_EXTENSIONS, parse_darwin_json
from darwin.utils import (
SUPPORTED_IMAGE_EXTENSIONS,
get_image_path_from_stream,
parse_darwin_json,
stream_darwin_json,
)


class LocalDataset:
Expand Down Expand Up @@ -126,30 +131,18 @@ def _setup_annotations_and_images(
partition,
split_type,
):
stems = build_stems(
release_path, annotations_dir, annotation_type, split, partition, split_type
)
for stem in stems:
annotation_path = annotations_dir / f"{stem}.json"
images = []
for ext in SUPPORTED_IMAGE_EXTENSIONS:
image_path = images_dir / f"{stem}{ext}"
if image_path.exists():
images.append(image_path)
continue
image_path = images_dir / f"{stem}{ext.upper()}"
if image_path.exists():
images.append(image_path)
if len(images) < 1:
# Find all the annotations and their corresponding images
for annotation_path in sorted(annotations_dir.glob("**/*.json")):
darwin_json = stream_darwin_json(annotation_path)
image_path = get_image_path_from_stream(darwin_json, images_dir)
if image_path.exists():
self.images_path.append(image_path)
self.annotations_path.append(annotation_path)
continue
else:
raise ValueError(
f"Annotation ({annotation_path}) does not have a corresponding image"
)
if len(images) > 1:
raise ValueError(
f"Image ({stem}) is present with multiple extensions. This is forbidden."
)
self.images_path.append(images[0])
self.annotations_path.append(annotation_path)

def _initial_setup(self, dataset_path, release_name):
assert dataset_path is not None
Expand Down
40 changes: 14 additions & 26 deletions darwin/dataset/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@
SUPPORTED_EXTENSIONS,
SUPPORTED_VIDEO_EXTENSIONS,
attempt_decode,
get_image_path_from_stream,
is_unix_like_os,
parse_darwin_json,
)
from darwin.utils.utils import stream_darwin_json

# E.g.: {"partition" => {"class_name" => 123}}
AnnotationDistribution = Dict[str, Counter]
Expand Down Expand Up @@ -569,33 +571,19 @@ def _map_annotations_to_images(
images_paths = []
annotations_paths = []
invalid_annotation_paths = []
for stem in stems:
annotation_path = annotations_dir / f"{stem}.json"
images = []
for ext in SUPPORTED_EXTENSIONS:
image_path = images_dir / f"{stem}{ext}"
if image_path.exists():
images.append(image_path)
continue
image_path = images_dir / f"{stem}{ext.upper()}"
if image_path.exists():
images.append(image_path)

image_count = len(images)
if image_count != 1 and ignore_inconsistent_examples:
invalid_annotation_paths.append(annotation_path)
for annotation_path in annotations_dir.glob("**/*.json"):
darwin_json = stream_darwin_json(annotation_path)
image_path = get_image_path_from_stream(darwin_json, images_dir)
if image_path.exists():
images_paths.append(image_path)
annotations_paths.append(annotation_path)
continue
elif image_count < 1:
raise ValueError(
f"Annotation ({annotation_path}) does not have a corresponding image"
)
elif image_count > 1:
raise ValueError(
f"Image ({stem}) is present with multiple extensions. This is forbidden."
)

images_paths.append(images[0])
annotations_paths.append(annotation_path)
else:
if ignore_inconsistent_examples:
invalid_annotation_paths.append(annotation_path)
continue
else:
raise ValueError(f"Annotation ({annotation_path}) does not have a corresponding image")

return images_paths, annotations_paths, invalid_annotation_paths

Expand Down
41 changes: 41 additions & 0 deletions darwin/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@
)

import deprecation
import json_stream
import numpy as np
import orjson as json
import requests
from json_stream.base import PersistentStreamingJSONObject
from jsonschema import exceptions, validators
from requests import Response, request
from rich.progress import ProgressType, track
Expand Down Expand Up @@ -454,6 +456,45 @@ def parse_darwin_json(path: Path, count: Optional[int] = None) -> Optional[dt.An
else:
return _parse_darwin_image(path, data, count)

def stream_darwin_json(path: Path) -> PersistentStreamingJSONObject:
    """
    Returns a Darwin JSON file as a persistent stream. This allows for parsing large files without
    loading them entirely into memory.

    Parameters
    ----------
    path : Path
        Path to the file to parse.

    Returns
    -------
    PersistentStreamingJSONObject
        A stream of the JSON file.
    """
    # Decode explicitly as UTF-8 (the interchange encoding for JSON) instead of relying on the
    # platform's locale encoding, which is not UTF-8 everywhere and would otherwise corrupt or
    # reject non-ASCII item names and paths.
    #
    # NOTE(review): the ``with`` block closes ``infile`` as soon as this function returns, so the
    # returned stream can only serve data that json_stream has already read from the file by
    # then. Presumably the tokenizer's read-ahead covers the keys callers access near the top of
    # the document -- confirm before relying on keys that appear late in large files.
    with path.open(encoding="utf-8") as infile:
        return json_stream.load(infile, persistent=True)

def get_image_path_from_stream(darwin_json: PersistentStreamingJSONObject, images_dir: Path) -> Path:
    """
    Builds the location of the image that a Darwin JSON annotation (V1 or V2) refers to.

    The location is ``images_dir`` joined with the folder path recorded in the annotation (with
    any leading ``/`` or ``\\`` characters removed) and the recorded file name. The ``item``
    section (``path`` / ``name``) is tried first; if any of those keys is missing, the ``image``
    section (``path`` / ``filename``) is used instead.

    Parameters
    ----------
    darwin_json : PersistentStreamingJSONObject
        A stream of the JSON file.
    images_dir : Path
        Path to the directory containing the images.

    Returns
    -------
    Path
        Path to the image file. Its existence on disk is not checked here.

    Raises
    ------
    KeyError
        If the fallback ``image`` section or one of its keys is missing as well.
    """
    try:
        # Layout with an ``item`` section.
        folder = images_dir / Path(darwin_json["item"]["path"].lstrip("/\\"))
        return folder / Path(darwin_json["item"]["name"])
    except KeyError:
        # No usable ``item`` section: fall back to the ``image`` section.
        folder = images_dir / Path(darwin_json["image"]["path"].lstrip("/\\"))
        return folder / Path(darwin_json["image"]["filename"])

def _parse_darwin_v2(path: Path, data: Dict[str, Any]) -> dt.AnnotationFile:
item = data["item"]
Expand Down
Loading

0 comments on commit 23d4d16

Please sign in to comment.