[AI-1260][internal] add loading of polygon support for object detection datasets #679

Merged
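This PR makes object-detection dataset loading pull bounding boxes from polygon annotations as well as from native bounding_box annotations. Below is a minimal usage sketch, not part of the PR itself: the team/dataset path is hypothetical, the keyword names follow the constructor visible in the diff, and defaults in the released package may differ.

```python
# Hedged usage sketch (not from the PR): after this change, requesting
# "bounding_box" annotations also surfaces boxes precomputed for polygon
# annotations. The dataset path below is hypothetical.
from pathlib import Path

from darwin.dataset.local_dataset import LocalDataset

dataset = LocalDataset(
    dataset_path=Path.home() / ".darwin" / "datasets" / "my-team" / "my-dataset",
    annotation_type="bounding_box",  # polygons carrying a bounding_box field are included too
)

print(dataset.num_classes)        # classes gathered from bounding_box AND polygon annotations
print(len(dataset.images_path))   # one image per annotation file
```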
Commits (35)
7ff1132
added albumentations transform test
ChristofferEdlund Sep 29, 2023
88a2750
updated poetry file
ChristofferEdlund Sep 29, 2023
520f9f3
added albumentations to poetry.lock
ChristofferEdlund Sep 29, 2023
38fb230
added manual install of albumentations
ChristofferEdlund Sep 29, 2023
bc5931c
Merge remote-tracking branch 'origin/master' into ai-1260-add-loading…
ChristofferEdlund Oct 9, 2023
9e83781
added support to load both polygon and bounding-box annotations for o…
ChristofferEdlund Oct 9, 2023
264fe33
commit
ChristofferEdlund Oct 9, 2023
31dc64c
removed test that will be introduced in another pr
ChristofferEdlund Oct 9, 2023
094cc70
added a check for duplicate classes (from polygon and bounding_boxes
ChristofferEdlund Oct 9, 2023
65d43eb
removed code that is not supposed to be in github workflow
ChristofferEdlund Oct 9, 2023
a3dfca9
updated stratified to support bounding_box + polygon
ChristofferEdlund Oct 10, 2023
99cb219
removed some printing
ChristofferEdlund Oct 11, 2023
752b54f
changes based on owen's feedback
ChristofferEdlund Oct 13, 2023
09ba55b
minor update
ChristofferEdlund Oct 17, 2023
4341ca0
Merge remote-tracking branch 'origin/master' into ai-1260-add-loading…
ChristofferEdlund Oct 17, 2023
199a71d
black formatting
ChristofferEdlund Oct 17, 2023
56788cf
reverted classes functionality to old one, but added the ability to l…
ChristofferEdlund Oct 17, 2023
3855279
linter check
ChristofferEdlund Oct 17, 2023
ba78fe1
poetry lock fix
ChristofferEdlund Oct 17, 2023
081f249
manually fixed some ruff issues
ChristofferEdlund Oct 17, 2023
c5f7286
ignoring ruff import * issues in dataset_test.py
ChristofferEdlund Oct 17, 2023
145ce20
refactored local_dataset class to appease ruff (to long init)
ChristofferEdlund Oct 17, 2023
99c4186
added test to extract_classes with multiple annotation types selected
ChristofferEdlund Oct 17, 2023
67dd274
added stratefied split logic to add polygons to bounding_box stratife…
ChristofferEdlund Oct 17, 2023
d128a18
merged from master
ChristofferEdlund Oct 17, 2023
7e1f194
BLACK
ChristofferEdlund Oct 17, 2023
94da955
Merge remote-tracking branch 'origin/master' into ai-1260-add-loading…
ChristofferEdlund Oct 17, 2023
04de9c5
revrting to old init
ChristofferEdlund Oct 17, 2023
57797ca
revrting to old init
ChristofferEdlund Oct 17, 2023
a4431f8
made the refactor more like the original
ChristofferEdlund Oct 17, 2023
0ce35b3
added black
ChristofferEdlund Oct 17, 2023
f2bee69
fixed minor issue
ChristofferEdlund Oct 17, 2023
6aab1ec
removed hard val- and test- set requirements
ChristofferEdlund Oct 18, 2023
0f799a5
is exhaust generator code present now?
ChristofferEdlund Oct 18, 2023
2273fa2
no longer forcing users to have a training split
ChristofferEdlund Oct 19, 2023
129 changes: 96 additions & 33 deletions darwin/dataset/local_dataset.py
@@ -3,7 +3,6 @@
from typing import Any, Dict, Iterator, List, Optional, Tuple

import numpy as np
import orjson as json
from PIL import Image as PILImage

from darwin.dataset.utils import get_classes, get_release_path, load_pil_image
@@ -64,20 +63,6 @@ def __init__(
split_type: str = "random",
release_name: Optional[str] = None,
):
assert dataset_path is not None
ChristofferEdlund (Contributor, Author) commented: Refactoring the init function to make ruff happy.

Reviewer (Contributor) commented: I like this refactoring - thanks ruff!

release_path = get_release_path(dataset_path, release_name)
annotations_dir = release_path / "annotations"
assert annotations_dir.exists()
images_dir = dataset_path / "images"
assert images_dir.exists()

if partition not in ["train", "val", "test", None]:
raise ValueError("partition should be either 'train', 'val', or 'test'")
if split_type not in ["random", "stratified"]:
raise ValueError("split_type should be either 'random', 'stratified'")
if annotation_type not in ["tag", "polygon", "bounding_box"]:
raise ValueError("annotation_type should be either 'tag', 'bounding_box', or 'polygon'")

self.dataset_path = dataset_path
self.annotation_type = annotation_type
self.images_path: List[Path] = []
@@ -86,15 +71,64 @@ def __init__(
self.original_images_path: Optional[List[Path]] = None
self.original_annotations_path: Optional[List[Path]] = None

release_path, annotations_dir, images_dir = self._initial_setup(
dataset_path, release_name
)
self._validate_inputs(partition, split_type, annotation_type)
# Get the list of classes

annotation_types = [self.annotation_type]
# We fetch bounding_boxes annotations from selected polygons as well
if self.annotation_type == "bounding_box":
annotation_types.append("polygon")
self.classes = get_classes(
self.dataset_path, release_name, annotation_type=self.annotation_type, remove_background=True
self.dataset_path,
release_name,
annotation_type=annotation_types,
remove_background=True,
)
self.num_classes = len(self.classes)
self._setup_annotations_and_images(
release_path,
annotations_dir,
images_dir,
annotation_type,
split,
partition,
split_type,
)

if len(self.images_path) == 0:
raise ValueError(
f"Could not find any {SUPPORTED_IMAGE_EXTENSIONS} file",
f" in {images_dir}",
)

assert len(self.images_path) == len(self.annotations_path)

stems = build_stems(release_path, annotations_dir, annotation_type, split, partition, split_type)
def _validate_inputs(self, partition, split_type, annotation_type):
if partition not in ["train", "val", "test", None]:
raise ValueError("partition should be either 'train', 'val', or 'test'")
if split_type not in ["random", "stratified"]:
raise ValueError("split_type should be either 'random', 'stratified'")
if annotation_type not in ["tag", "polygon", "bounding_box"]:
raise ValueError(
"annotation_type should be either 'tag', 'bounding_box', or 'polygon'"
)

# Find all the annotations and their corresponding images
def _setup_annotations_and_images(
self,
release_path,
annotations_dir,
images_dir,
annotation_type,
split,
partition,
split_type,
):
stems = build_stems(
release_path, annotations_dir, annotation_type, split, partition, split_type
)
for stem in stems:
annotation_path = annotations_dir / f"{stem}.json"
images = []
@@ -107,16 +141,24 @@ def __init__(
if image_path.exists():
images.append(image_path)
if len(images) < 1:
raise ValueError(f"Annotation ({annotation_path}) does not have a corresponding image")
raise ValueError(
f"Annotation ({annotation_path}) does not have a corresponding image"
)
if len(images) > 1:
raise ValueError(f"Image ({stem}) is present with multiple extensions. This is forbidden.")
raise ValueError(
f"Image ({stem}) is present with multiple extensions. This is forbidden."
)
self.images_path.append(images[0])
self.annotations_path.append(annotation_path)

if len(self.images_path) == 0:
raise ValueError(f"Could not find any {SUPPORTED_IMAGE_EXTENSIONS} file", f" in {images_dir}")

assert len(self.images_path) == len(self.annotations_path)
def _initial_setup(self, dataset_path, release_name):
Reviewer (Contributor) commented: I like the extraction here, even if it is only to make Ruff happy!

assert dataset_path is not None
release_path = get_release_path(dataset_path, release_name)
annotations_dir = release_path / "annotations"
assert annotations_dir.exists()
images_dir = dataset_path / "images"
assert images_dir.exists()
return release_path, annotations_dir, images_dir

def get_img_info(self, index: int) -> Dict[str, Any]:
"""
@@ -166,7 +208,9 @@ def get_height_and_width(self, index: int) -> Tuple[float, float]:
parsed = parse_darwin_json(self.annotations_path[index], index)
return parsed.image_height, parsed.image_width

def extend(self, dataset: "LocalDataset", extend_classes: bool = False) -> "LocalDataset":
def extend(
self, dataset: "LocalDataset", extend_classes: bool = False
) -> "LocalDataset":
"""
Extends the current dataset with another one.

@@ -261,7 +305,10 @@ def parse_json(self, index: int) -> Dict[str, Any]:
# Filter out unused classes and annotations of a different type
if self.classes is not None:
annotations = [
a for a in annotations if a.annotation_class.name in self.classes and self.annotation_type_supported(a)
a
for a in annotations
if a.annotation_class.name in self.classes
and self.annotation_type_supported(a)
]
return {
"image_id": index,
@@ -278,15 +325,20 @@ def annotation_type_supported(self, annotation) -> bool:
elif self.annotation_type == "bounding_box":
is_bounding_box = annotation_type == "bounding_box"
is_supported_polygon = (
annotation_type in ["polygon", "complex_polygon"] and "bounding_box" in annotation.data
annotation_type in ["polygon", "complex_polygon"]
and "bounding_box" in annotation.data
)
return is_bounding_box or is_supported_polygon
elif self.annotation_type == "polygon":
return annotation_type in ["polygon", "complex_polygon"]
else:
raise ValueError("annotation_type should be either 'tag', 'bounding_box', or 'polygon'")
raise ValueError(
"annotation_type should be either 'tag', 'bounding_box', or 'polygon'"
)

def measure_mean_std(self, multi_threaded: bool = True) -> Tuple[np.ndarray, np.ndarray]:
def measure_mean_std(
self, multi_threaded: bool = True
) -> Tuple[np.ndarray, np.ndarray]:
"""
Computes mean and std of trained images, given the train loader.

@@ -309,7 +361,9 @@ def measure_mean_std(self, multi_threaded: bool = True) -> Tuple[np.ndarray, np.
results = pool.map(self._return_mean, self.images_path)
mean = np.sum(np.array(results), axis=0) / len(self.images_path)
# Online image_classification deviation
results = pool.starmap(self._return_std, [[item, mean] for item in self.images_path])
results = pool.starmap(
self._return_std, [[item, mean] for item in self.images_path]
)
std_sum = np.sum(np.array([item[0] for item in results]), axis=0)
total_pixel_count = np.sum(np.array([item[1] for item in results]))
std = np.sqrt(std_sum / total_pixel_count)
@@ -355,14 +409,20 @@ def _compute_weights(labels: List[int]) -> np.ndarray:
@staticmethod
def _return_mean(image_path: Path) -> np.ndarray:
img = np.array(load_pil_image(image_path))
mean = np.array([np.mean(img[:, :, 0]), np.mean(img[:, :, 1]), np.mean(img[:, :, 2])])
mean = np.array(
[np.mean(img[:, :, 0]), np.mean(img[:, :, 1]), np.mean(img[:, :, 2])]
)
return mean / 255.0

# Loads an image with OpenCV and returns the channel wise std of the image.
@staticmethod
def _return_std(image_path: Path, mean: np.ndarray) -> Tuple[np.ndarray, float]:
img = np.array(load_pil_image(image_path)) / 255.0
m2 = np.square(np.array([img[:, :, 0] - mean[0], img[:, :, 1] - mean[1], img[:, :, 2] - mean[2]]))
m2 = np.square(
np.array(
[img[:, :, 0] - mean[0], img[:, :, 1] - mean[1], img[:, :, 2] - mean[2]]
)
)
return np.sum(np.sum(m2, axis=1), 1), m2.size / 3.0

def __getitem__(self, index: int):
@@ -432,7 +492,10 @@ def build_stems(
"""

if partition is None:
return (str(e.relative_to(annotations_dir).parent / e.stem) for e in sorted(annotations_dir.glob("**/*.json")))
return (
str(e.relative_to(annotations_dir).parent / e.stem)
for e in sorted(annotations_dir.glob("**/*.json"))
)

if split_type == "random":
split_filename = f"{split_type}_{partition}.txt"
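The core filtering rule for bounding-box datasets is the branch added to annotation_type_supported above: polygon and complex_polygon annotations are accepted only when their data already carries a precomputed bounding_box. Here is a standalone sketch of that rule, using plain dicts in place of darwin-py annotation objects (illustration only, not the PR's actual API):

```python
# Standalone illustration of the filter in annotation_type_supported; plain
# dicts stand in for darwin-py annotation objects.
from typing import Any, Dict


def keep_for_bounding_box_dataset(annotation_type: str, data: Dict[str, Any]) -> bool:
    """Return True if an annotation can feed a dataset loaded with annotation_type='bounding_box'."""
    if annotation_type == "bounding_box":
        return True
    # Polygons are only usable when the export already carries a bounding_box for them.
    return annotation_type in ["polygon", "complex_polygon"] and "bounding_box" in data


print(keep_for_bounding_box_dataset("polygon", {"bounding_box": {"x": 0, "y": 0, "w": 10, "h": 5}}))  # True
print(keep_for_bounding_box_dataset("polygon", {"path": []}))                                         # False
print(keep_for_bounding_box_dataset("tag", {}))                                                       # False
```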
80 changes: 65 additions & 15 deletions darwin/dataset/split_manager.py
Expand Up @@ -146,6 +146,10 @@ def split_dataset(
train_size: int = dataset_size - val_size - test_size
split_id = f"{train_size}_{val_size}_{test_size}"

assert train_size > 0, f"Found {train_size} train examples, we need at least 1"
assert val_size > 0, f"Found {val_size} validation examples, we need at least 1"
assert test_size > 0, f"Found {test_size} test examples, we need at least 1"

# Compute split id, a combination of val precentage, test percentage and split seed
# The split id is used to create a folder with the same name in the "lists" folder
if split_seed != 0:
@@ -228,7 +232,12 @@ def _stratified_split(
return

for stratified_type in stratified_types:
_, idx_to_classes = extract_classes(annotation_path, stratified_type)
if stratified_type == "bounding_box":
class_annotation_types = [stratified_type, "polygon"]
else:
class_annotation_types = stratified_type

_, idx_to_classes = extract_classes(annotation_path, class_annotation_types)
if len(idx_to_classes) == 0:
continue

@@ -252,13 +261,32 @@ def _stratified_split(
else:
test_indices.append(idx)

_write_to_file(annotation_path, annotation_files, split[stratified_type]["train"], train_indices)
_write_to_file(annotation_path, annotation_files, split[stratified_type]["val"], val_indices)
_write_to_file(annotation_path, annotation_files, split[stratified_type]["test"], test_indices)
_write_to_file(
annotation_path,
annotation_files,
split[stratified_type]["train"],
train_indices,
)
_write_to_file(
annotation_path,
annotation_files,
split[stratified_type]["val"],
val_indices,
)
_write_to_file(
annotation_path,
annotation_files,
split[stratified_type]["test"],
test_indices,
)


def _stratify_samples(
idx_to_classes: Dict[int, Set[str]], split_seed: int, train_size: int, val_size: int, test_size: int
idx_to_classes: Dict[int, Set[str]],
split_seed: int,
train_size: int,
val_size: int,
test_size: int,
) -> Tuple[List[int], List[int], List[int]]:
"""Splits the list of indices into train, val and test according to their labels (stratified)

@@ -292,8 +320,8 @@ def _stratify_samples(
# Extract entries whose support set is 1 (it would make sklearn crash) and append the to train later
unique_labels, count = np.unique(labels, return_counts=True)
single_files = []
for l in unique_labels[count == 1]:
index = np.where(labels == l)[0][0]
for label in unique_labels[count == 1]:
index = np.where(labels == label)[0][0]
single_files.append(file_indices[index])
labels = np.delete(labels, index)
file_indices = np.delete(file_indices, index)
@@ -330,7 +358,11 @@ def _stratify_samples(
# Remove duplicates within the same set
# NOTE: doing that earlier (e.g. in _remove_cross_contamination()) would produce mathematical
# mistakes in the class balancing between validation and test sets.
return (list(set(X_train.astype(int))), list(set(X_val.astype(int))), list(set(X_test.astype(int))))
return (
list(set(X_train.astype(int))),
list(set(X_val.astype(int))),
list(set(X_test.astype(int))),
)


def _remove_cross_contamination(
@@ -390,20 +422,33 @@ def _unique(array: np.ndarray) -> np.ndarray:
return array[sorted(indexes)]


def _write_to_file(annotation_path: Path, annotation_files: List[Path], file_path: Path, split_idx: Iterable) -> None:
def _write_to_file(
annotation_path: Path,
annotation_files: List[Path],
file_path: Path,
split_idx: Iterable,
) -> None:
with open(str(file_path), "w") as f:
for i in split_idx:
# To deal with recursive search, we want to write the difference between the annotation path
# and its parent, without the file extension
stem = str(annotation_files[i]).replace(f"{annotation_path}/", "").rsplit(".json", 1)[0]
stem = (
str(annotation_files[i])
.replace(f"{annotation_path}/", "")
.rsplit(".json", 1)[0]
)
f.write(f"{stem}\n")


def _validate_split(val_percentage: float, test_percentage: float) -> None:
if val_percentage is None or not 0 < val_percentage < 1:
raise ValueError(f"Invalid validation percentage ({val_percentage}). Must be a float x, where 0 < x < 1.")
raise ValueError(
f"Invalid validation percentage ({val_percentage}). Must be a float x, where 0 < x < 1."
)
if test_percentage is None or not 0 < test_percentage < 1:
raise ValueError(f"Invalid test percentage ({test_percentage}). Must be a float x, where 0 < x < 1.")
raise ValueError(
f"Invalid test percentage ({test_percentage}). Must be a float x, where 0 < x < 1."
)
if val_percentage + test_percentage >= 1:
raise ValueError(
f"Invalid combination of validation ({val_percentage}) and test ({test_percentage}) percentages. "
Expand All @@ -412,18 +457,23 @@ def _validate_split(val_percentage: float, test_percentage: float) -> None:


def _build_split(
split_path: Path, stratified_types: List[str], partitions: List[str] = ["train", "val", "test"]
split_path: Path,
stratified_types: List[str],
partitions: List[str] = ["train", "val", "test"],
) -> Split:
split = Split()

split.random = {partition: split_path / f"random_{partition}.txt" for partition in partitions}
split.random = {
partition: split_path / f"random_{partition}.txt" for partition in partitions
}
if len(stratified_types) == 0:
return split

stratified_dict: Dict[str, Dict[str, Path]] = {}
for stratified_type in stratified_types:
stratified_dict[stratified_type] = {
partition: split_path / f"stratified_{stratified_type}_{partition}.txt" for partition in partitions
partition: split_path / f"stratified_{stratified_type}_{partition}.txt"
for partition in partitions
}
split.stratified = stratified_dict
return split
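For reference, the Split object assembled by _build_split maps each partition to the text file that the _write_to_file calls above populate. A small sketch using only what is visible in this diff; the split_path folder name is hypothetical:

```python
# Sketch of the Split layout produced by _build_split; the split_path below
# is a hypothetical "lists" split-id folder inside a release.
from pathlib import Path

from darwin.dataset.split_manager import _build_split

split = _build_split(
    split_path=Path("releases/latest/lists/70_15_15"),  # hypothetical split-id folder
    stratified_types=["bounding_box"],
)

print(split.random["train"])                    # .../random_train.txt
print(split.stratified["bounding_box"]["val"])  # .../stratified_bounding_box_val.txt
```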