object-detection-main.py

import argparse
import os

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

import manga109api_custom
from datasetManga109 import CustomDataset, get_train_transform, get_valid_transform, get_train_transform_aug
from inference import get_prediction, load_inference_model
from metrics import calculate_mAP, calculate_mAP_authors
from solver import Solver
from utils import load_all_images

def get_args():
argParser = argparse.ArgumentParser()
# general network parameters
argParser.add_argument("-mode", "--mode", type = int, nargs = '?', const = 1, default = 0, help = "0 for training, 1 for loading a checkpoint, 2 for inference. Default is 0 (training mode)")
argParser.add_argument("-lr", "--learning_rate", type = float, nargs = '?', const = 1, default = 0.001, help = "Learning rate parameter for optimizer. Default is: 0.001")
argParser.add_argument("-bs", "--batch_size", type = int, nargs = '?', const = 1, default = 8, help = "Batch size parameter. Default is: 8")
argParser.add_argument("-add_auth", "--add_authors", type = int, nargs = '?', const = 1, default = 0, help = "1 if you want to include authors, 0 otherwise. Default is: 0")
argParser.add_argument("-res", "--resize_to", type = int, nargs = '?', const = 1, default = 512, help = "Resize dimensions of the input images. Default is: 512")
argParser.add_argument("-num_epochs", "--num_epochs", type = int, nargs = '?', const = 1, default = 20, help = "Number of epochs for training (Early stopping is implemented). Default is: 20")
argParser.add_argument("-min_ep", "--num_min_epochs", type = int, nargs = '?', default = 1, help = "Minimum number of epochs before using early stopping.")
argParser.add_argument("-fn", "--file_name", type = str, nargs = '?', const = 1, default = "model.pth", help = "Name of the file where the trained model will stored")
argParser.add_argument("-model", "--model", type = str, nargs = '?', const = 1, default="fasterrcnn", help = "Name of the model. Available models are: FasterRCNN (fasterrcnn), RetinaNet (retinanet), SSD300 (ssd). Default is fasterrcnn")
argParser.add_argument("-bb", "--backbone", type = str, nargs = '?', const = 1, default = "resnet50v2", help = "Name of the backbone for a FasterRCNN model. Available backbones are: resnet50, resnet50v2, mobilenet. Default is resnet50v2")
argParser.add_argument("-opt", "--optimizer", type = str, nargs = '?', const = 1, default = "SGD", help = "Name of the optimzer. Available optimizers are: SGD, Adam. Default is SGD")
argParser.add_argument("-checkpoint_path", "--checkpoint_path", type = str, nargs = '?', const = 1, default = "./", help = "Checkpoint path. Default is ./")
argParser.add_argument("-seed", "--seed", type = int, nargs= '?', const = 1, default = 42, help = "Random seed for dataset division. Default is 42")
argParser.add_argument("-print_every", "--print_every", type = int, nargs= '?', const = 1, default = 250, help = "Parameter used to determine every how many iterations to save the loss on the tensorboard. Default is 250.")
argParser.add_argument("-early_stopping", "--early_stopping", type = int, nargs = '?', const = 1, default = 1, help = "Parameter that controls early stopping. 0 = no early stopping. Values greater than 0 represent the value of patience. Eg: 1 = early stopping with patience 1")
argParser.add_argument("-pretrained", "--pretrained", type = int, nargs= '?', const = 1, default = 1, help = "Use pretrained model.")
argParser.add_argument("-dataset", "--dataset_dir", type = str, nargs = '?', const = 1, default = "Manga109/Manga109_released_2021_12_30", help = "Directory path of dataset")
argParser.add_argument("-inference_path", "--inference_path", type = str, nargs = '?', const = 1, default = "./inference_images", help = "Path where the images for inference are saved.")
argParser.add_argument("-dataset_transform", "--dataset_transform", type = int, nargs = '?', const = 1, default = 0, help = "1 if you want to use transformations, 0 otherwise.")
argParser.add_argument("-save_pred", "-save_predictions", type=float, nargs = '?', const = 1, default = 0, help= "1 if you want to save predictions on tensorboard, 0 otherwise")
argParser.add_argument("-writer_path", "-writer_path", type=str, nargs = '?', const = 1, default = "./runs/experiments", help= "The path for Tensorboard metrics")
argParser.add_argument("-det_thresh", "--detection_threshold", type = float, nargs = '?', const = 1, default = 0.50, help = "Detection threshold for the metric computation. Default is: 0.50")
argParser.add_argument("-split", "--split", type = float, nargs = '?', const = 1, default = 0.20, help = "The value used to split the dataset into train and validation subsets. Default is: 0.20 (80% training and 20% validation).")
argParser.add_argument("-map_authors", "--map_authors", type = int, nargs = '?', const = 1, default = 1, help = "Calculate mAP for author classification (available only if the author classification is enabled).")
# classes customization
argParser.add_argument("-body", "--body", type = int, nargs = '?', const = 1, default = 1, help = "1 if you want to train the model to recognize 'body' class, 0 otherwise.")
argParser.add_argument("-face", "--face", type = int, nargs = '?', const = 1, default = 1, help = "1 if you want to train the model to recognize 'face' class, 0 otherwise.")
argParser.add_argument("-frame", "--frame", type = int, nargs = '?', const = 1, default = 1, help = "1 if you want to train the model to recognize 'frame' class, 0 otherwise.")
argParser.add_argument("-text", "--text", type = int, nargs = '?', const = 1, default = 1, help = "1 if you want to the model to recognize 'text' class, 0 otherwise.")
# Specific FasterRCNN parameters
argParser.add_argument("-trainable_backbone_layers", "--trainable_backbone_layers", type = int, nargs = '?', const = 1, default = 3, help = "Number of trainable (not frozen) layers starting from final block. Valid values are between 0 and 5.")
# anchors customization
# sizes
argParser.add_argument("-custom_anch", "--custom_anchors", type = int, nargs = '?', const = 1, default = 0, help = "1 if you want to add custom anchors, 0 otherwise")
argParser.add_argument("-size8", "--size8", type = int, nargs = '?', const = 1, default = 0, help = "1 if you want to add size 8 for anchors, 0 otherwise")
argParser.add_argument("-size16", "--size16", type = int, nargs = '?', const = 1, default = 0, help = "1 if you want to add size 16 for anchors, 0 otherwise")
argParser.add_argument("-size32", "--size32", type = int, nargs = '?', const = 1, default = 1, help = "1 if you want to add size 32 for anchors, 0 otherwise")
argParser.add_argument("-size64", "--size64", type = int, nargs = '?', const = 1, default = 1, help = "1 if you want to add size 64 for anchors, 0 otherwise")
argParser.add_argument("-size128", "--size128", type = int, nargs = '?', const = 1, default = 1, help = "1 if you want to add size 128 for anchors, 0 otherwise")
argParser.add_argument("-size256", "--size256", type = int, nargs = '?', const = 1, default = 1, help = "1 if you want to add size 256 for anchors, 0 otherwise")
argParser.add_argument("-size512", "--size512", type = int, nargs = '?', const = 1, default = 1, help = "1 if you want to add size 512 for anchors, 0 otherwise")
# aspect ratios
argParser.add_argument("-ar05", "--ar05", type = int, nargs = '?', const = 1, default = 1, help = "1 if you want to add aspect ratio 1:1 for anchors, 0 otherwise")
argParser.add_argument("-ar1", "--ar1", type = int, nargs = '?', const = 1, default = 1, help = "1 if you want to add aspect ratio 1:2 for anchors, 0 otherwise")
argParser.add_argument("-ar2", "--ar2", type = int, nargs = '?', const = 1, default = 1, help = "1 if you want to add aspect ratio 2:1 for anchors, 0 otherwise")
# thresholds for the RPN network
argParser.add_argument("-rpn_nms_th", "--rpn_nms_threshold", type = float, nargs = '?', const = 1, default = 0.7, help = "NMS threshold used for postprocessing the RPN proposals. Deafult is: 0.7")
argParser.add_argument("-rpn_fg_iou_th", "--rpn_fg_iou_threshold", type = float, nargs = '?', const = 1, default = 0.7, help = "Minimum IoU between the anchor and the GT box so that they can be considered as positive during training of the RPN. Deafult is: 0.7")
argParser.add_argument("-rpn_bg_iou_th", "--rpn_bg_iou_threshold", type = float, nargs = '?', const = 1, default = 0.3, help = "Maximum IoU between the anchor and the GT box so that they can be considered as negative during training of the RPN. Deafult is: 0.7")
argParser.add_argument("-rpn_score_th", "--rpn_score_threshold", type = float, nargs = '?', const = 1, default = 0.0, help = "During inference, only return proposals with a classification score greater than rpn_score_thresh. Default is: 0.0")
    # thresholds for the classification network
argParser.add_argument("-box_nms_th", "--box_nms_threshold", type = float, nargs = '?', const = 1, default = 0.5, help = "NMS threshold for the prediction head. Used during inference. Default is: 0.5")
argParser.add_argument("-box_fg_iou_th", "--box_fg_iou_threshold", type = float, nargs = '?', const = 1, default = 0.5, help = "Minimum IoU between the proposal and the GT box so that they can beconsidered as positive during training of the classification head. Deafult is: 0.5")
argParser.add_argument("-box_bg_iou_th", "--box_bg_iou_threshold", type = float, nargs = '?', const = 1, default = 0.5, help = "Maximum IoU between the proposal and the GT box so that they can beconsidered as negative during training of the classification head. Deafult is: 0.5")
argParser.add_argument("-box_score_th", "--box_score_threshold", type = float, nargs = '?', const = 1, default = 0.05, help = "During inference, only return proposals with a classification score greater than box_score_thresh. Default is: 0.05")
# number of detection per image
argParser.add_argument("-box_detections", "--box_detections_per_img", type = int, nargs = '?', const = 1, default = 100, help = "Maximum number of detections per image, for all classes. Default is: 100")
return argParser.parse_args()
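
# Illustrative command lines (the flag names come from the parser above; the
# values are examples only, not recommended settings):
#   python object-detection-main.py -mode 0 -model fasterrcnn -bb resnet50v2 -bs 8 -num_epochs 20
#   python object-detection-main.py -mode 2 -fn model.pth -inference_path ./inference_images
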
def check_args_integrity(args):
if args.detection_threshold < 0 or args.detection_threshold > 1:
print("Error. Detection threshold (detection_threshold) must be between 0 and 1.")
os._exit(1)
    if args.num_epochs < 0:
        print("Error. Number of epochs (num_epochs) can't be a negative value.")
        os._exit(1)
    if args.num_min_epochs < 0:
        print("Error. Minimum number of epochs (num_min_epochs) can't be a negative value.")
        os._exit(1)
    if args.batch_size <= 0:
        print("Error. Batch size (batch_size) must be a positive number.")
        os._exit(1)
    if args.print_every <= 0:
        print("Error. The number of iterations between tensorboard logs (print_every) must be positive.")
        os._exit(1)
if args.optimizer != "SGD" and args.optimizer != "Adam":
print("Error. The optimizer must be SGD or Adam")
os._exit(1)
if args.model != "fasterrcnn" and args.model != "retinanet" and args.model != "ssd":
print("Error. The model name must be fasterrcnn, retinanet or ssd")
os._exit(1)
if args.backbone != "resnet50" and args.backbone != "resnet50v2" and args.backbone != "mobilenet":
print("Error. Backbone name must be resnet50, resnet50v2 or mobilenet")
os._exit(1)
if args.resize_to <= 0:
print("Error. resize_to must be a positive number")
os._exit(1)
if args.learning_rate <= 0:
print("Error. Learning rate must be positive")
os._exit(1)
    if args.early_stopping < 0:
        print("Error. Early stopping value can't be negative (0 disables early stopping).")
        os._exit(1)
    for class_name in ("body", "face", "frame", "text"):
        if getattr(args, class_name) not in (0, 1):
            print("Error. Class selector (" + class_name + ") must be 1 or 0")
            os._exit(1)
    if args.size8 == 0 and args.size16 == 0 and args.size32 == 0 and args.size64 == 0 and args.size128 == 0 and args.size256 == 0 and args.size512 == 0:
print("Insert at least one size for anchors")
os._exit(1)
if args.ar05 == 0 and args.ar1 == 0 and args.ar2 == 0:
print("Insert at least one aspect ratio for anchors")
os._exit(1)
"""
If model is not a FasterRCNN and the user adds the authors, add_authors is automatically set to 0
because the authors classification is not supported with the other models
"""
if args.add_authors and args.model != "fasterrcnn":
args.add_authors = 0
if args.model != "fasterrcnn":
args.backbone = "-"
if args.add_authors == 0 and args.map_authors == 1:
args.map_authors = 0
def main(args):
check_args_integrity(args)
    RESIZE_TO = args.resize_to
    BATCH_SIZE = args.batch_size
    NUM_EPOCHS = args.num_epochs
    LEARNING_RATE = args.learning_rate
    NUM_WORKERS = 0  # 0 = load data in the main process
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# create class list
CLASSES = ["__background__"]
if args.body:
CLASSES.append("body")
if args.face:
CLASSES.append("face")
if args.frame:
CLASSES.append("frame")
if args.text:
CLASSES.append("text")
    NUM_CLASSES = len(CLASSES)
if args.mode == 2:
# inference mode
inference_model = load_inference_model(args.file_name, DEVICE)
get_prediction(inference_model = inference_model, classes = CLASSES, args = args)
else:
print("---- TRAINING PARAMETERS ----")
print("Batch size: " + str(BATCH_SIZE))
print("Resize: " + str(RESIZE_TO))
print("Model: " + args.model)
print("Backbone: " + args.backbone)
print("Pretrained: " + ("Yes" if args.pretrained else "No"))
print("Optimizer: " + str(args.optimizer))
print("Learning rate: " + str(LEARNING_RATE))
print("Epochs: " + str(NUM_EPOCHS))
print("Authors: " + ("Included" if args.add_authors else "Not included"))
print("Classes: " + str(NUM_CLASSES))
print("Data aug: " + ("Yes" if args.dataset_transform else "No"))
if args.model=="fasterrcnn":
print("Trainable backbone layers: " + str(args.trainable_backbone_layers))
print("-----------------------------")
        '''
        Read the authors from a dedicated file with ID, NAME and BOOK TITLE.
        The author information is also used to split the dataset.
        '''
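        # autori.txt is expected to be tab-separated, one book per line:
        #   <id>\t<author>\t<title>
        # (layout inferred from the read_csv call below)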
file_path = "autori.txt"
data = pd.read_csv(file_path, sep="\t", header=None)
data.columns = ["id", "author", "title"]
ids = data["id"].tolist()
authors = data["author"].tolist()
titles = data["title"].tolist()
AUTHORS = authors.copy()
AUTHORS.insert(0, "background")
NUM_AUTHORS = len(AUTHORS)
        # Prepare a list of [id, author, title] entries to pass to the parser
        authors_list = []
        for book_id, author, title in zip(ids, authors, titles):
            authors_list.append([book_id, author, title])
manga109_root_dir = args.dataset_dir
# Custom parser from manga109api_custom
p = manga109api_custom.Parser(root_dir=manga109_root_dir, authors_list=authors_list)
        images, authors_labels = load_all_images(p, CLASSES)
        '''
        Split the dataset into training and validation subsets.
        Stratifying on the author labels keeps the distribution of authors
        similar across the two subsets.
        '''
train_images, val_images, _ , _ = train_test_split(images, authors_labels, shuffle=True, stratify=authors_labels, test_size=args.split, random_state=args.seed)
        # convert the lists into pandas DataFrames
df_train = pd.DataFrame(train_images, columns=["path", "annotation"])
df_val = pd.DataFrame(val_images, columns=["path", "annotation"])
if args.dataset_transform:
train_dataset = CustomDataset(df_train, RESIZE_TO, RESIZE_TO, CLASSES, get_train_transform_aug())
else:
train_dataset = CustomDataset(df_train, RESIZE_TO, RESIZE_TO, CLASSES, get_train_transform())
val_dataset = CustomDataset(df_val, RESIZE_TO, RESIZE_TO, CLASSES, get_valid_transform())
        def collate_fn(batch):
            """
            Keep the batch as tuples instead of stacking: images in a batch may
            contain a different number of objects, so the target tensors have
            varying sizes and cannot be stacked into a single tensor.
            """
            return tuple(zip(*batch))
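        # Example: [(img1, tgt1), (img2, tgt2)] -> ((img1, img2), (tgt1, tgt2))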
print(f"Number of training images: {len(train_dataset)}")
print(f"Number of validation images: {len(val_dataset)}")
train_loader = DataLoader(
train_dataset,
batch_size=BATCH_SIZE,
shuffle=True,
num_workers=NUM_WORKERS,
collate_fn=collate_fn
)
val_loader = DataLoader(
val_dataset,
batch_size=BATCH_SIZE,
            shuffle=False,  # no need to shuffle the validation set
num_workers=NUM_WORKERS,
collate_fn=collate_fn
)
        writer = SummaryWriter(args.writer_path)
"""
solver
"""
solver = Solver(train_data_loader=train_loader, val_data_loader=val_loader, device=DEVICE, writer=writer, args=args, n_classes=NUM_CLASSES, n_authors=NUM_AUTHORS)
if args.mode == 1:
# load a checkpoint
solver.load_model(DEVICE)
"""
training and metrics computation
"""
if args.add_authors:
solver.train_with_authors()
else:
solver.train()
        if args.map_authors:
            calculate_mAP_authors(model=solver.model, classes=CLASSES, device=solver.device, val_loader=solver.val_loader, args=args, writer=writer, draw_prediction=args.save_predictions)
        else:
            calculate_mAP(model=solver.model, classes=CLASSES, device=solver.device, val_loader=solver.val_loader, args=args, writer=writer, draw_prediction=args.save_predictions)
if __name__=="__main__":
args = get_args()
main(args)