Skip to content

Commit

Permalink
[datasets] Allow detection task for built-in datasets (#1717)
Browse files Browse the repository at this point in the history
  • Loading branch information
felixdittrich92 authored Oct 1, 2024
1 parent dccc26b commit 7f6757c
Show file tree
Hide file tree
Showing 18 changed files with 586 additions and 125 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ quality:

# this target runs checks on all files and potentially modifies some of them
style:
ruff check --fix .
ruff format .
ruff check --fix .

# Run tests for the library
test:
Expand Down
19 changes: 17 additions & 2 deletions docs/source/using_doctr/using_datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ This dataset contains the information to train or validate a text detection mod
from doctr.datasets import CORD
# Load straight boxes
train_set = CORD(train=True, download=True)
train_set = CORD(train=True, download=True, detection_task=True)
# Load rotated boxes
train_set = CORD(train=True, download=True, use_polygons=True)
train_set = CORD(train=True, download=True, use_polygons=True, detection_task=True)
img, target = train_set[0]
Expand Down Expand Up @@ -99,6 +99,21 @@ This dataset contains the information to train or validate a text recognition m
img, target = train_set[0]
OCR
^^^

The same datasets as for detection, but the targets additionally include both the bounding boxes and the labels.

.. code:: python3
from doctr.datasets import CORD
# Load straight boxes
train_set = CORD(train=True, download=True)
# Load rotated boxes
train_set = CORD(train=True, download=True, use_polygons=True)
img, target = train_set[0]
Object Detection
^^^^^^^^^^^^^^^^

Expand Down
11 changes: 10 additions & 1 deletion doctr/datasets/cord.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class CORD(VisionDataset):
train: whether the subset should be the training one
use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
recognition_task: whether the dataset should be used for recognition task
detection_task: whether the dataset should be used for detection task
**kwargs: keyword arguments from `VisionDataset`.
"""

Expand All @@ -53,6 +54,7 @@ def __init__(
train: bool = True,
use_polygons: bool = False,
recognition_task: bool = False,
detection_task: bool = False,
**kwargs: Any,
) -> None:
url, sha256, name = self.TRAIN if train else self.TEST
Expand All @@ -64,10 +66,15 @@ def __init__(
pre_transforms=convert_target_to_relative if not recognition_task else None,
**kwargs,
)
if recognition_task and detection_task:
raise ValueError(
"`recognition_task` and `detection_task` cannot be set to True simultaneously. "
+ "To get the whole dataset with boxes and labels leave both parameters to False."
)

# List images
tmp_root = os.path.join(self.root, "image")
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
self.train = train
np_dtype = np.float32
for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking CORD", total=len(os.listdir(tmp_root))):
Expand Down Expand Up @@ -109,6 +116,8 @@ def __init__(
)
for crop, label in zip(crops, list(text_targets)):
self.data.append((crop, label))
elif detection_task:
self.data.append((img_path, np.asarray(box_targets, dtype=int).clip(min=0)))
else:
self.data.append((
img_path,
Expand Down
12 changes: 11 additions & 1 deletion doctr/datasets/funsd.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class FUNSD(VisionDataset):
train: whether the subset should be the training one
use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
recognition_task: whether the dataset should be used for recognition task
detection_task: whether the dataset should be used for detection task
**kwargs: keyword arguments from `VisionDataset`.
"""

Expand All @@ -45,6 +46,7 @@ def __init__(
train: bool = True,
use_polygons: bool = False,
recognition_task: bool = False,
detection_task: bool = False,
**kwargs: Any,
) -> None:
super().__init__(
Expand All @@ -55,6 +57,12 @@ def __init__(
pre_transforms=convert_target_to_relative if not recognition_task else None,
**kwargs,
)
if recognition_task and detection_task:
raise ValueError(
"`recognition_task` and `detection_task` cannot be set to True simultaneously. "
+ "To get the whole dataset with boxes and labels leave both parameters to False."
)

self.train = train
np_dtype = np.float32

Expand All @@ -63,7 +71,7 @@ def __init__(

# # List images
tmp_root = os.path.join(self.root, subfolder, "images")
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking FUNSD", total=len(os.listdir(tmp_root))):
# File existence check
if not os.path.exists(os.path.join(tmp_root, img_path)):
Expand Down Expand Up @@ -100,6 +108,8 @@ def __init__(
# filter labels with unknown characters
if not any(char in label for char in ["☑", "☐", "\uf703", "\uf702"]):
self.data.append((crop, label))
elif detection_task:
self.data.append((img_path, np.asarray(box_targets, dtype=np_dtype)))
else:
self.data.append((
img_path,
Expand Down
12 changes: 11 additions & 1 deletion doctr/datasets/ic03.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class IC03(VisionDataset):
train: whether the subset should be the training one
use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
recognition_task: whether the dataset should be used for recognition task
detection_task: whether the dataset should be used for detection task
**kwargs: keyword arguments from `VisionDataset`.
"""

Expand All @@ -51,6 +52,7 @@ def __init__(
train: bool = True,
use_polygons: bool = False,
recognition_task: bool = False,
detection_task: bool = False,
**kwargs: Any,
) -> None:
url, sha256, file_name = self.TRAIN if train else self.TEST
Expand All @@ -62,8 +64,14 @@ def __init__(
pre_transforms=convert_target_to_relative if not recognition_task else None,
**kwargs,
)
if recognition_task and detection_task:
raise ValueError(
"`recognition_task` and `detection_task` cannot be set to True simultaneously. "
+ "To get the whole dataset with boxes and labels leave both parameters to False."
)

self.train = train
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
np_dtype = np.float32

# Load xml data
Expand Down Expand Up @@ -117,6 +125,8 @@ def __init__(
for crop, label in zip(crops, labels):
if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
self.data.append((crop, label))
elif detection_task:
self.data.append((name.text, boxes))
else:
self.data.append((name.text, dict(boxes=boxes, labels=labels)))

Expand Down
11 changes: 10 additions & 1 deletion doctr/datasets/ic13.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ class IC13(AbstractDataset):
label_folder: folder with all annotation files for the images
use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
recognition_task: whether the dataset should be used for recognition task
detection_task: whether the dataset should be used for detection task
**kwargs: keyword arguments from `AbstractDataset`.
"""

Expand All @@ -47,19 +48,25 @@ def __init__(
label_folder: str,
use_polygons: bool = False,
recognition_task: bool = False,
detection_task: bool = False,
**kwargs: Any,
) -> None:
super().__init__(
img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
)
if recognition_task and detection_task:
raise ValueError(
"`recognition_task` and `detection_task` cannot be set to True simultaneously. "
+ "To get the whole dataset with boxes and labels leave both parameters to False."
)

# File existence check
if not os.path.exists(label_folder) or not os.path.exists(img_folder):
raise FileNotFoundError(
f"unable to locate {label_folder if not os.path.exists(label_folder) else img_folder}"
)

self.data: List[Tuple[Union[Path, np.ndarray], Union[str, Dict[str, Any]]]] = []
self.data: List[Tuple[Union[Path, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
np_dtype = np.float32

img_names = os.listdir(img_folder)
Expand Down Expand Up @@ -95,5 +102,7 @@ def __init__(
crops = crop_bboxes_from_image(img_path=img_path, geoms=box_targets)
for crop, label in zip(crops, labels):
self.data.append((crop, label))
elif detection_task:
self.data.append((img_path, box_targets))
else:
self.data.append((img_path, dict(boxes=box_targets, labels=labels)))
42 changes: 26 additions & 16 deletions doctr/datasets/iiit5k.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class IIIT5K(VisionDataset):
train: whether the subset should be the training one
use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
recognition_task: whether the dataset should be used for recognition task
detection_task: whether the dataset should be used for detection task
**kwargs: keyword arguments from `VisionDataset`.
"""

Expand All @@ -45,6 +46,7 @@ def __init__(
train: bool = True,
use_polygons: bool = False,
recognition_task: bool = False,
detection_task: bool = False,
**kwargs: Any,
) -> None:
super().__init__(
Expand All @@ -55,14 +57,20 @@ def __init__(
pre_transforms=convert_target_to_relative if not recognition_task else None,
**kwargs,
)
if recognition_task and detection_task:
raise ValueError(
"`recognition_task` and `detection_task` cannot be set to True simultaneously. "
+ "To get the whole dataset with boxes and labels leave both parameters to False."
)

self.train = train

# Load mat data
tmp_root = os.path.join(self.root, "IIIT5K") if self.SHA256 else self.root
mat_file = "trainCharBound" if self.train else "testCharBound"
mat_data = sio.loadmat(os.path.join(tmp_root, f"{mat_file}.mat"))[mat_file][0]

self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
np_dtype = np.float32

for img_path, label, box_targets in tqdm(iterable=mat_data, desc="Unpacking IIIT5K", total=len(mat_data)):
Expand All @@ -73,24 +81,26 @@ def __init__(
if not os.path.exists(os.path.join(tmp_root, _raw_path)):
raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, _raw_path)}")

if use_polygons:
# (x, y) coordinates of top left, top right, bottom right, bottom left corners
box_targets = [
[
[box[0], box[1]],
[box[0] + box[2], box[1]],
[box[0] + box[2], box[1] + box[3]],
[box[0], box[1] + box[3]],
]
for box in box_targets
]
else:
# xmin, ymin, xmax, ymax
box_targets = [[box[0], box[1], box[0] + box[2], box[1] + box[3]] for box in box_targets]

if recognition_task:
self.data.append((_raw_path, _raw_label))
elif detection_task:
self.data.append((_raw_path, np.asarray(box_targets, dtype=np_dtype)))
else:
if use_polygons:
# (x, y) coordinates of top left, top right, bottom right, bottom left corners
box_targets = [
[
[box[0], box[1]],
[box[0] + box[2], box[1]],
[box[0] + box[2], box[1] + box[3]],
[box[0], box[1] + box[3]],
]
for box in box_targets
]
else:
# xmin, ymin, xmax, ymax
box_targets = [[box[0], box[1], box[0] + box[2], box[1] + box[3]] for box in box_targets]

# label are casted to list where each char corresponds to the character's bounding box
self.data.append((
_raw_path,
Expand Down
11 changes: 10 additions & 1 deletion doctr/datasets/imgur5k.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ class IMGUR5K(AbstractDataset):
train: whether the subset should be the training one
use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
recognition_task: whether the dataset should be used for recognition task
detection_task: whether the dataset should be used for detection task
**kwargs: keyword arguments from `AbstractDataset`.
"""

Expand All @@ -56,17 +57,23 @@ def __init__(
train: bool = True,
use_polygons: bool = False,
recognition_task: bool = False,
detection_task: bool = False,
**kwargs: Any,
) -> None:
super().__init__(
img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
)
if recognition_task and detection_task:
raise ValueError(
"`recognition_task` and `detection_task` cannot be set to True simultaneously. "
+ "To get the whole dataset with boxes and labels leave both parameters to False."
)

# File existence check
if not os.path.exists(label_path) or not os.path.exists(img_folder):
raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")

self.data: List[Tuple[Union[str, Path, np.ndarray], Union[str, Dict[str, Any]]]] = []
self.data: List[Tuple[Union[str, Path, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
self.train = train
np_dtype = np.float32

Expand Down Expand Up @@ -132,6 +139,8 @@ def __init__(
tmp_img = Image.fromarray(crop)
tmp_img.save(os.path.join(reco_folder_path, f"{reco_images_counter}.png"))
reco_images_counter += 1
elif detection_task:
self.data.append((img_path, np.asarray(box_targets, dtype=np_dtype)))
else:
self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=labels)))

Expand Down
12 changes: 11 additions & 1 deletion doctr/datasets/sroie.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class SROIE(VisionDataset):
train: whether the subset should be the training one
use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
recognition_task: whether the dataset should be used for recognition task
detection_task: whether the dataset should be used for detection task
**kwargs: keyword arguments from `VisionDataset`.
"""

Expand All @@ -52,6 +53,7 @@ def __init__(
train: bool = True,
use_polygons: bool = False,
recognition_task: bool = False,
detection_task: bool = False,
**kwargs: Any,
) -> None:
url, sha256, name = self.TRAIN if train else self.TEST
Expand All @@ -63,10 +65,16 @@ def __init__(
pre_transforms=convert_target_to_relative if not recognition_task else None,
**kwargs,
)
if recognition_task and detection_task:
raise ValueError(
"`recognition_task` and `detection_task` cannot be set to True simultaneously. "
+ "To get the whole dataset with boxes and labels leave both parameters to False."
)

self.train = train

tmp_root = os.path.join(self.root, "images")
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
np_dtype = np.float32

for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking SROIE", total=len(os.listdir(tmp_root))):
Expand Down Expand Up @@ -94,6 +102,8 @@ def __init__(
for crop, label in zip(crops, labels):
if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
self.data.append((crop, label))
elif detection_task:
self.data.append((img_path, coords))
else:
self.data.append((img_path, dict(boxes=coords, labels=labels)))

Expand Down
Loading

0 comments on commit 7f6757c

Please sign in to comment.