-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathmerge.py
110 lines (86 loc) · 3.5 KB
/
merge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import argparse
import os
import hashlib
import pathlib
import shutil
from tqdm import tqdm
import random
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description='Merge LADD',
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
'-d', '--dir',
required=True,
type=str,
help='Directory containing LADD files',
default='dataset'
)
return parser.parse_args()
def find_datasets(dir: str) -> list[str]:
    """Recursively collect directories that contain an ``Annotations`` entry.

    A directory that directly contains an entry named ``Annotations`` is
    reported as a dataset and is not searched any further. Every visited
    directory is printed as a progress trace.

    Args:
        dir: Directory to start the search from.

    Returns:
        Paths of the dataset directories found, in sorted traversal order.
    """
    print(dir)
    # List the directory once and sort it, so the result does not depend on
    # the order in which the filesystem happens to return entries.
    entries = sorted(os.listdir(dir))
    if 'Annotations' in entries:
        return [dir]

    datasets = []
    for entry in entries:
        candidate = os.path.join(dir, entry)
        if os.path.isdir(candidate):
            datasets += find_datasets(candidate)
    return datasets
def merge_datasets(
    dirs: list[str],
    outdir: str,
    *,
    train_fraction: float = 0.75,
    progress=None,
) -> None:
    """Merge several datasets into a single one under ``outdir``.

    Each source dataset is expected to hold ``Annotations/<name>.xml`` files
    with matching ``JPEGImages/<name>.jpg`` images. Every pair is copied to
    ``outdir`` under a new name, the MD5 hex digest of the image bytes, and
    the ``<filename>`` tag of the annotation is rewritten to that name.
    Finally the merged names are shuffled and split into the list files
    ``train.txt``/``trainval.txt`` and ``val.txt``/``test.txt`` inside
    ``ImageSets/Main``.

    Args:
        dirs: Source dataset directories.
        outdir: Destination directory; created if missing.
        train_fraction: Share of the merged images written to the train
            lists; the remainder goes to the val/test lists.
        progress: Optional callable wrapping the per-dataset iterable of
            annotation file names (for example to silence the progress
            bar). Defaults to ``tqdm``.

    Raises:
        ValueError: If ``train_fraction`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(
            f'train_fraction must be within [0, 1], got {train_fraction}'
        )
    # ``tqdm`` is only looked up when the caller did not supply a wrapper.
    wrap = tqdm if progress is None else progress

    out_annotations = os.path.join(outdir, 'Annotations')
    out_image_sets = os.path.join(outdir, 'ImageSets', 'Main')
    out_jpeg_images = os.path.join(outdir, 'JPEGImages')
    for folder in (out_annotations, out_image_sets, out_jpeg_images):
        os.makedirs(folder, exist_ok=True)

    out_root = os.path.abspath(outdir)
    names = []
    seen = set()
    for dataset in dirs:
        # Merging the output into itself would copy files onto themselves
        # (shutil.SameFileError), so a source equal to outdir is ignored.
        if os.path.abspath(dataset) == out_root:
            print(f'Skipping output directory {dataset}')
            continue

        print(f'Merging {dataset}')
        annotations = os.listdir(os.path.join(dataset, 'Annotations'))
        for annotation in wrap(annotations):
            old_name = pathlib.Path(annotation).stem
            old_annotation = os.path.join(dataset, 'Annotations', annotation)
            old_image = os.path.join(dataset, 'JPEGImages', old_name + '.jpg')
            if not (os.path.exists(old_annotation)
                    and os.path.exists(old_image)):
                print(f'Skipped {old_name}')
                continue

            # Content-derived name: identical images from different
            # datasets collapse onto the same output files.
            with open(old_image, 'rb') as image_file:
                new_name = hashlib.md5(image_file.read()).hexdigest()
            if new_name not in seen:
                # Record each image once so a duplicate cannot end up in
                # both the train and the test split.
                seen.add(new_name)
                names.append(new_name)

            # Images belong in JPEGImages; ImageSets/Main only holds the
            # split list files.
            shutil.copy(
                old_image,
                os.path.join(out_jpeg_images, f'{new_name}.jpg'),
            )

            with open(old_annotation, 'r') as source:
                annotation_text = source.read()
            new_tag = f'<filename>{new_name}</filename>'
            # The tag may carry the stem as-is or without its
            # 'train_'/'test_' marker; rewrite whichever form is present.
            candidates = (
                old_name,
                old_name.replace('train_', ''),
                old_name.replace('test_', ''),
            )
            for candidate in candidates:
                annotation_text = annotation_text.replace(
                    f'<filename>{candidate}</filename>', new_tag
                )
            new_annotation = os.path.join(out_annotations, f'{new_name}.xml')
            with open(new_annotation, 'w') as target:
                target.write(annotation_text)

    random.shuffle(names)
    split_at = int(len(names) * train_fraction)
    train_names = names[:split_at]
    test_names = names[split_at:]
    # train/trainval share one list, val/test share the other.
    splits = {
        'train.txt': train_names,
        'trainval.txt': train_names,
        'val.txt': test_names,
        'test.txt': test_names,
    }
    for file_name, split in splits.items():
        with open(os.path.join(out_image_sets, file_name), 'w') as split_file:
            split_file.write('\n'.join(split))
def main() -> None:
    """Find every dataset below the ``--dir`` root and merge them.

    The merged dataset is written to ``<dir>/full_train_ds``.
    """
    args = parse_args()
    output_dir = os.path.join(args.dir, 'full_train_ds')
    output_root = os.path.abspath(output_dir)
    # The output lives inside the scanned root, so a previous run's result
    # would be rediscovered as a source dataset; leave it out.
    datasets = [
        dataset
        for dataset in find_datasets(args.dir)
        if os.path.abspath(dataset) != output_root
    ]
    print(f'Merging {len(datasets)} datasets:')
    for dataset in datasets:
        print(f'\t{dataset}')
    merge_datasets(datasets, output_dir)
# Run the merge only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()