[AI-1260][internal] add loading of polygon support for object detection datasets #679

Merged
Changes from 1 commit (of 35 commits)
7ff1132
added albumentations transform test
ChristofferEdlund Sep 29, 2023
88a2750
updated poetry file
ChristofferEdlund Sep 29, 2023
520f9f3
added albumentations to poetry.lock
ChristofferEdlund Sep 29, 2023
38fb230
added manual install of albumentations
ChristofferEdlund Sep 29, 2023
bc5931c
Merge remote-tracking branch 'origin/master' into ai-1260-add-loading…
ChristofferEdlund Oct 9, 2023
9e83781
added support to load both polygon and bounding-box annotations for o…
ChristofferEdlund Oct 9, 2023
264fe33
commit
ChristofferEdlund Oct 9, 2023
31dc64c
removed test that will be introduced in another pr
ChristofferEdlund Oct 9, 2023
094cc70
added a check for duplicate classes (from polygon and bounding_boxes
ChristofferEdlund Oct 9, 2023
65d43eb
removed code that is not supposed to be in github workflow
ChristofferEdlund Oct 9, 2023
a3dfca9
updated stratified to support bounding_box + polygon
ChristofferEdlund Oct 10, 2023
99cb219
removed some printing
ChristofferEdlund Oct 11, 2023
752b54f
changes based on owen's feedback
ChristofferEdlund Oct 13, 2023
09ba55b
minor update
ChristofferEdlund Oct 17, 2023
4341ca0
Merge remote-tracking branch 'origin/master' into ai-1260-add-loading…
ChristofferEdlund Oct 17, 2023
199a71d
black formatting
ChristofferEdlund Oct 17, 2023
56788cf
reverted classes functionality to old one, but added the ability to l…
ChristofferEdlund Oct 17, 2023
3855279
linter check
ChristofferEdlund Oct 17, 2023
ba78fe1
poetry lock fix
ChristofferEdlund Oct 17, 2023
081f249
manually fixed some ruff issues
ChristofferEdlund Oct 17, 2023
c5f7286
ignoring ruff import * issues in dataset_test.py
ChristofferEdlund Oct 17, 2023
145ce20
refactored local_dataset class to appease ruff (to long init)
ChristofferEdlund Oct 17, 2023
99c4186
added test to extract_classes with multiple annotation types selected
ChristofferEdlund Oct 17, 2023
67dd274
added stratefied split logic to add polygons to bounding_box stratife…
ChristofferEdlund Oct 17, 2023
d128a18
merged from master
ChristofferEdlund Oct 17, 2023
7e1f194
BLACK
ChristofferEdlund Oct 17, 2023
94da955
Merge remote-tracking branch 'origin/master' into ai-1260-add-loading…
ChristofferEdlund Oct 17, 2023
04de9c5
revrting to old init
ChristofferEdlund Oct 17, 2023
57797ca
revrting to old init
ChristofferEdlund Oct 17, 2023
a4431f8
made the refactor more like the original
ChristofferEdlund Oct 17, 2023
0ce35b3
added black
ChristofferEdlund Oct 17, 2023
f2bee69
fixed minor issue
ChristofferEdlund Oct 17, 2023
6aab1ec
removed hard val- and test- set requirements
ChristofferEdlund Oct 18, 2023
0f799a5
is exhaust generator code present now?
ChristofferEdlund Oct 18, 2023
2273fa2
no longer forcing users to have a training split
ChristofferEdlund Oct 19, 2023
added black
ChristofferEdlund committed Oct 17, 2023
commit 0ce35b3e59acc44bb2d2afb1a96dc06ef9ea957c
101 changes: 82 additions & 19 deletions darwin/dataset/local_dataset.py
@@ -71,20 +71,38 @@ def __init__(
         self.original_images_path: Optional[List[Path]] = None
         self.original_annotations_path: Optional[List[Path]] = None
 
-        release_path, annotations_dir, images_dir = self._initial_setup(dataset_path, release_name)
+        release_path, annotations_dir, images_dir = self._initial_setup(
+            dataset_path, release_name
+        )
         self._validate_inputs(partition, split_type, annotation_type)
         # Get the list of classes
 
         annotation_types = [self.annotation_type]
         # We fetch bounding_boxes annotations from selected polygons as well
         if self.annotation_type == "bounding_boxes":
             annotation_types.append("polygon")
-        self.classes = get_classes(self.dataset_path, release_name, annotation_type=annotation_types, remove_background=True)
+        self.classes = get_classes(
+            self.dataset_path,
+            release_name,
+            annotation_type=annotation_types,
+            remove_background=True,
+        )
         self.num_classes = len(self.classes)
-        self._setup_annotations_and_images(release_path, annotations_dir, images_dir, annotation_type, split, partition, split_type)
+        self._setup_annotations_and_images(
+            release_path,
+            annotations_dir,
+            images_dir,
+            annotation_type,
+            split,
+            partition,
+            split_type,
+        )
 
         if len(self.images_path) == 0:
-            raise ValueError(f"Could not find any {SUPPORTED_IMAGE_EXTENSIONS} file", f" in {images_dir}")
+            raise ValueError(
+                f"Could not find any {SUPPORTED_IMAGE_EXTENSIONS} file",
+                f" in {images_dir}",
+            )
 
         assert len(self.images_path) == len(self.annotations_path)
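For context, a minimal usage sketch of what this hunk enables (the dataset path is hypothetical and assumes the dataset was already pulled with darwin-py): loading with `annotation_type="bounding_box"` now also gathers classes from polygon annotations, since exported polygons carry a computed `bounding_box` in their data.

```python
from pathlib import Path

from darwin.dataset.local_dataset import LocalDataset

# Hypothetical path to a dataset previously pulled with darwin-py.
dataset = LocalDataset(
    dataset_path=Path("~/.darwin/datasets/my-team/my-dataset").expanduser(),
    annotation_type="bounding_box",  # polygon classes are now included as well
)
print(dataset.num_classes)
```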

@@ -94,10 +112,23 @@ def _validate_inputs(self, partition, split_type, annotation_type):
         if split_type not in ["random", "stratified"]:
             raise ValueError("split_type should be either 'random', 'stratified'")
         if annotation_type not in ["tag", "polygon", "bounding_box"]:
-            raise ValueError("annotation_type should be either 'tag', 'bounding_box', or 'polygon'")
+            raise ValueError(
+                "annotation_type should be either 'tag', 'bounding_box', or 'polygon'"
+            )
 
-    def _setup_annotations_and_images(self, release_path, annotations_dir, images_dir, annotation_type, split, partition, split_type):
-        stems = build_stems(release_path, annotations_dir, annotation_type, split, partition, split_type)
+    def _setup_annotations_and_images(
+        self,
+        release_path,
+        annotations_dir,
+        images_dir,
+        annotation_type,
+        split,
+        partition,
+        split_type,
+    ):
+        stems = build_stems(
+            release_path, annotations_dir, annotation_type, split, partition, split_type
+        )
         for stem in stems:
             annotation_path = annotations_dir / f"{stem}.json"
             images = []
@@ -110,9 +141,13 @@ def _setup_annotations_and_images(self, release_path, annotations_dir, images_di
                 if image_path.exists():
                     images.append(image_path)
             if len(images) < 1:
-                raise ValueError(f"Annotation ({annotation_path}) does not have a corresponding image")
+                raise ValueError(
+                    f"Annotation ({annotation_path}) does not have a corresponding image"
+                )
             if len(images) > 1:
-                raise ValueError(f"Image ({stem}) is present with multiple extensions. This is forbidden.")
+                raise ValueError(
+                    f"Image ({stem}) is present with multiple extensions. This is forbidden."
+                )
             self.images_path.append(images[0])
             self.annotations_path.append(annotation_path)
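Restated outside the class, the pairing rule this hunk enforces: each annotation stem must match exactly one image file. A standalone sketch (`SUPPORTED_IMAGE_EXTENSIONS` here is an illustrative subset of the real constant):

```python
from pathlib import Path

SUPPORTED_IMAGE_EXTENSIONS = [".png", ".jpg", ".jpeg"]  # illustrative subset

def find_image_for_stem(images_dir: Path, stem: str) -> Path:
    # Collect every existing file that shares the annotation's stem.
    matches = [
        images_dir / f"{stem}{ext}"
        for ext in SUPPORTED_IMAGE_EXTENSIONS
        if (images_dir / f"{stem}{ext}").exists()
    ]
    if len(matches) < 1:
        raise ValueError(f"Annotation ({stem}.json) does not have a corresponding image")
    if len(matches) > 1:
        raise ValueError(f"Image ({stem}) is present with multiple extensions. This is forbidden.")
    return matches[0]
```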

@@ -173,7 +208,9 @@ def get_height_and_width(self, index: int) -> Tuple[float, float]:
         parsed = parse_darwin_json(self.annotations_path[index], index)
         return parsed.image_height, parsed.image_width
 
-    def extend(self, dataset: "LocalDataset", extend_classes: bool = False) -> "LocalDataset":
+    def extend(
+        self, dataset: "LocalDataset", extend_classes: bool = False
+    ) -> "LocalDataset":
         """
         Extends the current dataset with another one.
@@ -267,7 +304,12 @@ def parse_json(self, index: int) -> Dict[str, Any]:
 
         # Filter out unused classes and annotations of a different type
         if self.classes is not None:
-            annotations = [a for a in annotations if a.annotation_class.name in self.classes and self.annotation_type_supported(a)]
+            annotations = [
+                a
+                for a in annotations
+                if a.annotation_class.name in self.classes
+                and self.annotation_type_supported(a)
+            ]
         return {
             "image_id": index,
             "image_path": str(self.images_path[index]),
@@ -282,14 +324,21 @@ def annotation_type_supported(self, annotation) -> bool:
             return annotation_type == "tag"
         elif self.annotation_type == "bounding_box":
             is_bounding_box = annotation_type == "bounding_box"
-            is_supported_polygon = annotation_type in ["polygon", "complex_polygon"] and "bounding_box" in annotation.data
+            is_supported_polygon = (
+                annotation_type in ["polygon", "complex_polygon"]
+                and "bounding_box" in annotation.data
+            )
             return is_bounding_box or is_supported_polygon
         elif self.annotation_type == "polygon":
             return annotation_type in ["polygon", "complex_polygon"]
         else:
-            raise ValueError("annotation_type should be either 'tag', 'bounding_box', or 'polygon'")
+            raise ValueError(
+                "annotation_type should be either 'tag', 'bounding_box', or 'polygon'"
+            )
 
-    def measure_mean_std(self, multi_threaded: bool = True) -> Tuple[np.ndarray, np.ndarray]:
+    def measure_mean_std(
+        self, multi_threaded: bool = True
+    ) -> Tuple[np.ndarray, np.ndarray]:
         """
         Computes mean and std of trained images, given the train loader.
@@ -312,7 +361,9 @@ def measure_mean_std(self, multi_threaded: bool = True) -> Tuple[np.ndarray, np.
             results = pool.map(self._return_mean, self.images_path)
             mean = np.sum(np.array(results), axis=0) / len(self.images_path)
             # Online image_classification deviation
-            results = pool.starmap(self._return_std, [[item, mean] for item in self.images_path])
+            results = pool.starmap(
+                self._return_std, [[item, mean] for item in self.images_path]
+            )
             std_sum = np.sum(np.array([item[0] for item in results]), axis=0)
             total_pixel_count = np.sum(np.array([item[1] for item in results]))
             std = np.sqrt(std_sum / total_pixel_count)
@@ -358,14 +409,20 @@ def _compute_weights(labels: List[int]) -> np.ndarray:
     @staticmethod
     def _return_mean(image_path: Path) -> np.ndarray:
         img = np.array(load_pil_image(image_path))
-        mean = np.array([np.mean(img[:, :, 0]), np.mean(img[:, :, 1]), np.mean(img[:, :, 2])])
+        mean = np.array(
+            [np.mean(img[:, :, 0]), np.mean(img[:, :, 1]), np.mean(img[:, :, 2])]
+        )
         return mean / 255.0
 
     # Loads an image with OpenCV and returns the channel wise std of the image.
     @staticmethod
     def _return_std(image_path: Path, mean: np.ndarray) -> Tuple[np.ndarray, float]:
         img = np.array(load_pil_image(image_path)) / 255.0
-        m2 = np.square(np.array([img[:, :, 0] - mean[0], img[:, :, 1] - mean[1], img[:, :, 2] - mean[2]]))
+        m2 = np.square(
+            np.array(
+                [img[:, :, 0] - mean[0], img[:, :, 1] - mean[1], img[:, :, 2] - mean[2]]
+            )
+        )
         return np.sum(np.sum(m2, axis=1), 1), m2.size / 3.0
 
     def __getitem__(self, index: int):
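The two-pass statistic above in miniature: the first pass averages per-image channel means, the second accumulates squared deviations and divides by the total pixel count. A toy example with plain numpy arrays (not the darwin API):

```python
import numpy as np

# Toy "dataset": two 2x2 RGB images with values already scaled to [0, 1].
images = [np.full((2, 2, 3), 0.2), np.full((2, 2, 3), 0.6)]

# Pass 1: channel-wise mean, averaged over images (as _return_mean does per file).
mean = np.mean([img.mean(axis=(0, 1)) for img in images], axis=0)  # -> [0.4, 0.4, 0.4]

# Pass 2: channel-wise sum of squared deviations, then divide by the pixel count.
std_sum = np.zeros(3)
pixels = 0
for img in images:
    std_sum += np.square(img - mean).sum(axis=(0, 1))
    pixels += img.shape[0] * img.shape[1]
std = np.sqrt(std_sum / pixels)  # -> [0.2, 0.2, 0.2]
print(mean, std)
```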
@@ -435,7 +492,10 @@ def build_stems(
     """
 
     if partition is None:
-        return (str(e.relative_to(annotations_dir).parent / e.stem) for e in sorted(annotations_dir.glob("**/*.json")))
+        return (
+            str(e.relative_to(annotations_dir).parent / e.stem)
+            for e in sorted(annotations_dir.glob("**/*.json"))
+        )
 
     if split_type == "random":
         split_filename = f"{split_type}_{partition}.txt"
@@ -448,4 +508,7 @@ def build_stems(
     if split_path.is_file():
         return (e.strip("\n\r") for e in split_path.open())
 
-    raise FileNotFoundError("could not find a dataset partition. " "Split the dataset using `split_dataset()` from `darwin.dataset.split_manager`")
+    raise FileNotFoundError(
+        "could not find a dataset partition. "
+        "Split the dataset using `split_dataset()` from `darwin.dataset.split_manager`"
+    )
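When no split file is found, the error above points users at `split_dataset()`. A hedged sketch of that step (the path is hypothetical, and the optional keyword arguments such as validation/test percentages vary by darwin-py version, so check the installed signature):

```python
from darwin.dataset.split_manager import split_dataset

# Hypothetical dataset path; generates split list files (e.g. random_train.txt)
# under the release's lists directory so build_stems can find a partition.
split_dataset("/datasets/my-team/my-dataset")
```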