Skip to content

Commit

Permalink
BLACK
Browse files Browse the repository at this point in the history
  • Loading branch information
ChristofferEdlund committed Oct 17, 2023
1 parent d128a18 commit 7e1f194
Show file tree
Hide file tree
Showing 5 changed files with 431 additions and 147 deletions.
106 changes: 86 additions & 20 deletions darwin/dataset/local_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,20 +71,38 @@ def __init__(
self.original_images_path: Optional[List[Path]] = None
self.original_annotations_path: Optional[List[Path]] = None

release_path, annotations_dir, images_dir = self._initial_setup(dataset_path, release_name)
release_path, annotations_dir, images_dir = self._initial_setup(
dataset_path, release_name
)
self._validate_inputs(partition, split_type, annotation_type)
# Get the list of classes

annotation_types = [self.annotation_type]
# We fetch bounding_boxes annotations from selected polygons as well
if self.annotation_type == "bounding_boxes":
annotation_types.append("polygon")
self.classes = get_classes(self.dataset_path, release_name, annotation_type=annotation_types, remove_background=True)
self.classes = get_classes(
self.dataset_path,
release_name,
annotation_type=annotation_types,
remove_background=True,
)
self.num_classes = len(self.classes)
self._setup_annotations_and_images(release_path, annotations_dir, images_dir, annotation_type, split, partition, split_type)
self._setup_annotations_and_images(
release_path,
annotations_dir,
images_dir,
annotation_type,
split,
partition,
split_type,
)

if len(self.images_path) == 0:
raise ValueError(f"Could not find any {SUPPORTED_IMAGE_EXTENSIONS} file", f" in {images_dir}")
raise ValueError(
f"Could not find any {SUPPORTED_IMAGE_EXTENSIONS} file",
f" in {images_dir}",
)

assert len(self.images_path) == len(self.annotations_path)

Expand All @@ -94,22 +112,42 @@ def _validate_inputs(self, partition, split_type, annotation_type):
if split_type not in ["random", "stratified"]:
raise ValueError("split_type should be either 'random', 'stratified'")
if annotation_type not in ["tag", "polygon", "bounding_box"]:
raise ValueError("annotation_type should be either 'tag', 'bounding_box', or 'polygon'")
raise ValueError(
"annotation_type should be either 'tag', 'bounding_box', or 'polygon'"
)

def _setup_annotations_and_images(
    self,
    release_path,
    annotations_dir,
    images_dir,
    annotation_type,
    split,
    partition,
    split_type,
):
    """Populate ``self.images_path`` and ``self.annotations_path`` in lockstep.

    For every annotation stem produced by ``build_stems`` this looks for a
    ``<stem>.json`` annotation under ``annotations_dir`` and exactly one image
    file with the same stem under ``images_dir`` (any supported extension,
    lower- or upper-case).

    Raises
    ------
    ValueError
        If an annotation has no corresponding image, or if the same stem is
        present with more than one image extension.
    """
    stems = build_stems(
        release_path, annotations_dir, annotation_type, split, partition, split_type
    )
    for stem in stems:
        annotation_path = annotations_dir / f"{stem}.json"
        # Probe both lower-case and upper-case variants of each extension;
        # keep only the paths that actually exist on disk.
        images = [
            image_path
            for ext in SUPPORTED_IMAGE_EXTENSIONS
            for image_path in [
                images_dir / f"{stem}{ext}",
                images_dir / f"{stem}{ext.upper()}",
            ]
            if image_path.exists()
        ]
        if len(images) < 1:
            raise ValueError(
                f"Annotation ({annotation_path}) does not have a corresponding image"
            )
        if len(images) > 1:
            raise ValueError(
                f"Image ({stem}) is present with multiple extensions. This is forbidden."
            )
        self.images_path.append(images[0])
        self.annotations_path.append(annotation_path)

Expand Down Expand Up @@ -170,7 +208,9 @@ def get_height_and_width(self, index: int) -> Tuple[float, float]:
parsed = parse_darwin_json(self.annotations_path[index], index)
return parsed.image_height, parsed.image_width

def extend(self, dataset: "LocalDataset", extend_classes: bool = False) -> "LocalDataset":
def extend(
self, dataset: "LocalDataset", extend_classes: bool = False
) -> "LocalDataset":
"""
Extends the current dataset with another one.
Expand Down Expand Up @@ -264,7 +304,12 @@ def parse_json(self, index: int) -> Dict[str, Any]:

# Filter out unused classes and annotations of a different type
if self.classes is not None:
annotations = [a for a in annotations if a.annotation_class.name in self.classes and self.annotation_type_supported(a)]
annotations = [
a
for a in annotations
if a.annotation_class.name in self.classes
and self.annotation_type_supported(a)
]
return {
"image_id": index,
"image_path": str(self.images_path[index]),
Expand All @@ -279,14 +324,21 @@ def annotation_type_supported(self, annotation) -> bool:
return annotation_type == "tag"
elif self.annotation_type == "bounding_box":
is_bounding_box = annotation_type == "bounding_box"
is_supported_polygon = annotation_type in ["polygon", "complex_polygon"] and "bounding_box" in annotation.data
is_supported_polygon = (
annotation_type in ["polygon", "complex_polygon"]
and "bounding_box" in annotation.data
)
return is_bounding_box or is_supported_polygon
elif self.annotation_type == "polygon":
return annotation_type in ["polygon", "complex_polygon"]
else:
raise ValueError("annotation_type should be either 'tag', 'bounding_box', or 'polygon'")
raise ValueError(
"annotation_type should be either 'tag', 'bounding_box', or 'polygon'"
)

def measure_mean_std(self, multi_threaded: bool = True) -> Tuple[np.ndarray, np.ndarray]:
def measure_mean_std(
self, multi_threaded: bool = True
) -> Tuple[np.ndarray, np.ndarray]:
"""
Computes mean and std of trained images, given the train loader.
Expand All @@ -309,7 +361,9 @@ def measure_mean_std(self, multi_threaded: bool = True) -> Tuple[np.ndarray, np.
results = pool.map(self._return_mean, self.images_path)
mean = np.sum(np.array(results), axis=0) / len(self.images_path)
# Online image_classification deviation
results = pool.starmap(self._return_std, [[item, mean] for item in self.images_path])
results = pool.starmap(
self._return_std, [[item, mean] for item in self.images_path]
)
std_sum = np.sum(np.array([item[0] for item in results]), axis=0)
total_pixel_count = np.sum(np.array([item[1] for item in results]))
std = np.sqrt(std_sum / total_pixel_count)
Expand Down Expand Up @@ -355,14 +409,20 @@ def _compute_weights(labels: List[int]) -> np.ndarray:
@staticmethod
def _return_mean(image_path: Path) -> np.ndarray:
    """Return the per-channel mean of the image at ``image_path``.

    The image is loaded via ``load_pil_image`` and the mean of each of the
    first three channels is computed, then scaled from [0, 255] to [0, 1].
    NOTE(review): assumes the loaded image has at least 3 channels (RGB) —
    confirm against ``load_pil_image``.
    """
    img = np.array(load_pil_image(image_path))
    mean = np.array(
        [np.mean(img[:, :, 0]), np.mean(img[:, :, 1]), np.mean(img[:, :, 2])]
    )
    return mean / 255.0

# Loads an image and returns the per-channel sum of squared deviations plus
# the per-channel pixel count, for a later pooled std computation.
@staticmethod
def _return_std(image_path: Path, mean: np.ndarray) -> Tuple[np.ndarray, float]:
    """Return ``(sum of squared deviations per channel, pixels per channel)``.

    ``mean`` is the dataset-wide per-channel mean in [0, 1] (as produced by
    ``_return_mean``); the image is scaled to [0, 1] before subtracting it.
    The caller accumulates these partial sums across images and divides to
    obtain the std (see ``measure_mean_std``).
    """
    img = np.array(load_pil_image(image_path)) / 255.0
    m2 = np.square(
        np.array(
            [img[:, :, 0] - mean[0], img[:, :, 1] - mean[1], img[:, :, 2] - mean[2]]
        )
    )
    # m2.size / 3.0 is the pixel count of a single channel.
    return np.sum(np.sum(m2, axis=1), 1), m2.size / 3.0

def __getitem__(self, index: int):
Expand Down Expand Up @@ -432,7 +492,10 @@ def build_stems(
"""

if partition is None:
return (str(e.relative_to(annotations_dir).parent / e.stem) for e in sorted(annotations_dir.glob("**/*.json")))
return (
str(e.relative_to(annotations_dir).parent / e.stem)
for e in sorted(annotations_dir.glob("**/*.json"))
)

if split_type == "random":
split_filename = f"{split_type}_{partition}.txt"
Expand All @@ -445,4 +508,7 @@ def build_stems(
if split_path.is_file():
return (e.strip("\n\r") for e in split_path.open())

raise FileNotFoundError("could not find a dataset partition. " "Split the dataset using `split_dataset()` from `darwin.dataset.split_manager`")
raise FileNotFoundError(
"could not find a dataset partition. "
"Split the dataset using `split_dataset()` from `darwin.dataset.split_manager`"
)
76 changes: 62 additions & 14 deletions darwin/dataset/split_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,9 @@ def split_dataset(
try:
import sklearn # noqa
except ImportError:
raise ImportError("Darwin requires scikit-learn to split a dataset. Install it using: pip install scikit-learn") from None
raise ImportError(
"Darwin requires scikit-learn to split a dataset. Install it using: pip install scikit-learn"
) from None

_validate_split(val_percentage, test_percentage)

Expand Down Expand Up @@ -259,13 +261,32 @@ def _stratified_split(
else:
test_indices.append(idx)

_write_to_file(annotation_path, annotation_files, split[stratified_type]["train"], train_indices)
_write_to_file(annotation_path, annotation_files, split[stratified_type]["val"], val_indices)
_write_to_file(annotation_path, annotation_files, split[stratified_type]["test"], test_indices)
_write_to_file(
annotation_path,
annotation_files,
split[stratified_type]["train"],
train_indices,
)
_write_to_file(
annotation_path,
annotation_files,
split[stratified_type]["val"],
val_indices,
)
_write_to_file(
annotation_path,
annotation_files,
split[stratified_type]["test"],
test_indices,
)


def _stratify_samples(
idx_to_classes: Dict[int, Set[str]], split_seed: int, train_size: int, val_size: int, test_size: int
idx_to_classes: Dict[int, Set[str]],
split_seed: int,
train_size: int,
val_size: int,
test_size: int,
) -> Tuple[List[int], List[int], List[int]]:
"""Splits the list of indices into train, val and test according to their labels (stratified)
Expand Down Expand Up @@ -337,7 +358,11 @@ def _stratify_samples(
# Remove duplicates within the same set
# NOTE: doing that earlier (e.g. in _remove_cross_contamination()) would produce mathematical
# mistakes in the class balancing between validation and test sets.
return (list(set(X_train.astype(int))), list(set(X_val.astype(int))), list(set(X_test.astype(int))))
return (
list(set(X_train.astype(int))),
list(set(X_val.astype(int))),
list(set(X_test.astype(int))),
)


def _remove_cross_contamination(
Expand Down Expand Up @@ -397,35 +422,58 @@ def _unique(array: np.ndarray) -> np.ndarray:
return array[sorted(indexes)]


def _write_to_file(annotation_path: Path, annotation_files: List[Path], file_path: Path, split_idx: Iterable) -> None:
def _write_to_file(
annotation_path: Path,
annotation_files: List[Path],
file_path: Path,
split_idx: Iterable,
) -> None:
with open(str(file_path), "w") as f:
for i in split_idx:
# To deal with recursive search, we want to write the difference between the annotation path
# and its parent, without the file extension
stem = str(annotation_files[i]).replace(f"{annotation_path}/", "").rsplit(".json", 1)[0]
stem = (
str(annotation_files[i])
.replace(f"{annotation_path}/", "")
.rsplit(".json", 1)[0]
)
f.write(f"{stem}\n")


def _validate_split(val_percentage: float, test_percentage: float) -> None:
if val_percentage is None or not 0 < val_percentage < 1:
raise ValueError(f"Invalid validation percentage ({val_percentage}). Must be a float x, where 0 < x < 1.")
raise ValueError(
f"Invalid validation percentage ({val_percentage}). Must be a float x, where 0 < x < 1."
)
if test_percentage is None or not 0 < test_percentage < 1:
raise ValueError(f"Invalid test percentage ({test_percentage}). Must be a float x, where 0 < x < 1.")
raise ValueError(
f"Invalid test percentage ({test_percentage}). Must be a float x, where 0 < x < 1."
)
if val_percentage + test_percentage >= 1:
raise ValueError(
f"Invalid combination of validation ({val_percentage}) and test ({test_percentage}) percentages. " f"Their sum must be a value x, where x < 1."
f"Invalid combination of validation ({val_percentage}) and test ({test_percentage}) percentages. "
f"Their sum must be a value x, where x < 1."
)


def _build_split(
    split_path: Path,
    stratified_types: List[str],
    partitions: Optional[List[str]] = None,
) -> Split:
    """Build a ``Split`` object mapping partitions to their split files.

    Parameters
    ----------
    split_path : Path
        Directory containing the ``random_*.txt`` / ``stratified_*_*.txt`` files.
    stratified_types : List[str]
        Stratification types to include; when empty only random splits are set.
    partitions : Optional[List[str]]
        Partition names; defaults to ``["train", "val", "test"]``.
        (``None`` sentinel instead of a mutable list default.)
    """
    if partitions is None:
        partitions = ["train", "val", "test"]
    split = Split()

    split.random = {
        partition: split_path / f"random_{partition}.txt" for partition in partitions
    }
    if len(stratified_types) == 0:
        return split

    stratified_dict: Dict[str, Dict[str, Path]] = {}
    for stratified_type in stratified_types:
        stratified_dict[stratified_type] = {
            partition: split_path / f"stratified_{stratified_type}_{partition}.txt"
            for partition in partitions
        }
    split.stratified = stratified_dict
    return split
Loading

0 comments on commit 7e1f194

Please sign in to comment.