-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathcreate_dataset.py
72 lines (50 loc) · 2.12 KB
/
create_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
"""
Author: Mauro Mendez.
Date: 02/11/2020.
File to create a single time split (training/validation/testing).
The dataset is expected to be in a folder following the structure:
data/
cross_validation/ (The folder you're currently in)
dataset/
0/
1/
preprocessing/
You must change the logic to read your dataset in case it follows another structure.
The bottom section of this code expects a list with the absolute paths to the images
and a list with their labels.
"""
import glob
import os

import pandas as pd
from sklearn.model_selection import train_test_split
#! /////////// Change code to read your dataset //////
# Path separator of the host OS, so the same script runs unmodified on
# Linux/macOS ('/') and Windows ('\').
SPLIT_CHAR = os.sep
DATASET_FOLDER = '..' + SPLIT_CHAR + 'dataset' + SPLIT_CHAR # Change '..' for an absolute path
IMAGE_EXTENSION = '*.png' # Change for the extension of your images

print('Reading Dataset...')
# Collect every image located one level below DATASET_FOLDER (one sub-folder
# per class). The paths are only absolute if DATASET_FOLDER is absolute.
# glob returns its matches in arbitrary, filesystem-dependent order, so they
# are sorted to make the listing (and everything derived from it) deterministic.
images = sorted(glob.glob(os.path.join(DATASET_FOLDER, '*', IMAGE_EXTENSION)))
if not images:
    # Fail here with the offending location instead of letting the split step
    # fail later with an error unrelated to the real cause.
    raise FileNotFoundError(
        'No images matching {} found in the class folders of {}'.format(
            IMAGE_EXTENSION, os.path.abspath(DATASET_FOLDER)))

# The label of an image is the name of its parent folder, which must be an
# integer (e.g. dataset/0/, dataset/1/). os.path is used instead of splitting
# on a fixed character so mixed or OS-specific separators are handled.
labels = [int(os.path.basename(os.path.dirname(img))) for img in images]
print("Splitting dataset...")
# Fraction of the full dataset assigned to each split; they must add up to 1,
# otherwise the second split below would not yield the declared proportions.
train_ratio = 0.75
val_ratio = 0.1
test_ratio = 0.15
if abs(train_ratio + val_ratio + test_ratio - 1.0) > 1e-9:
    raise ValueError('train_ratio, val_ratio and test_ratio must add up to 1')

# Fixed seed: for the same list of images in the same order, re-running the
# script regenerates the same split instead of silently moving images
# between the training and testing sets.
RANDOM_SEED = 42

# First cut: training set vs. a holdout containing validation + testing images.
# Stratifying on the labels keeps the class proportions in both parts.
train_x, holdout_x, train_y, holdout_y = train_test_split(
    images, labels,
    train_size=train_ratio,
    stratify=labels,
    random_state=RANDOM_SEED)

# Second cut: divide the holdout so that validation and testing end up with
# val_ratio and test_ratio of the full dataset, respectively.
val_x, test_x, val_y, test_y = train_test_split(
    holdout_x, holdout_y,
    test_size=test_ratio / (test_ratio + val_ratio),
    stratify=holdout_y,
    random_state=RANDOM_SEED)
print("Saving datasets...")


def _write_labels_csv(image_paths, image_labels, csv_path):
    """Write image paths and labels to csv_path and return the table.

    The table has the columns ID_IMG and LABEL; the DataFrame index is
    written as well, since to_csv is called with its default arguments.
    """
    frame = pd.DataFrame({'ID_IMG': image_paths, 'LABEL': image_labels})
    frame.to_csv(csv_path)
    return frame


# One CSV for the whole dataset plus one per split, all in the parent directory.
dataset_df = _write_labels_csv(images, labels, '../full_dataset_labels.csv')
train_df = _write_labels_csv(train_x, train_y, '../train_labels.csv')
val_df = _write_labels_csv(val_x, val_y, '../val_labels.csv')
test_df = _write_labels_csv(test_x, test_y, '../test_labels.csv')