-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdata_load.py
120 lines (103 loc) · 4.51 KB
/
data_load.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from tqdm import tqdm
import os
import cv2
import json
import sys
import os
def load_data_testcase(PATH):
    """Load PNG images and their space-separated token labels from PATH.

    Every directory entry whose extension is ``.png`` is read with
    ``cv2.imread(..., 0)`` and paired with the first line of the text file
    that shares its base name (``<name>.txt``). That line is split on single
    spaces and each token is stripped of surrounding whitespace.

    Args:
        PATH: Directory holding the ``.png``/``.txt`` sample pairs. A trailing
            path separator is accepted but not required.

    Returns:
        A tuple ``(X, Y)``: ``X`` is the list of values returned by
        ``cv2.imread`` (stored as-is, without checking that decoding
        succeeded) and ``Y`` the list of token lists, in matching order.

    Raises:
        FileNotFoundError: If a ``.png`` entry has no matching ``.txt`` file.
    """
    X = []
    Y = []
    for file in tqdm(os.listdir(PATH)):
        # splitext copes with entries that carry no extension at all
        # (sub-directories, "README", ...) instead of raising IndexError,
        # and keeps the full stem for names that contain several dots.
        sample, extension = os.path.splitext(file)
        if extension != ".png":
            continue
        # os.path.join adds the separator only when PATH does not end in one.
        X.append(cv2.imread(os.path.join(PATH, file), 0))
        with open(os.path.join(PATH, f"{sample}.txt"), "r") as agnosticfile:
            # Only the first line of the transcription file is used.
            first_line = agnosticfile.readline()
        Y.append([token.strip() for token in first_line.split(" ")])
    return X, Y
def load_data_jsonMuret(PATH, encoding):
    """Load cropped images and symbol sequences described by JSON files.

    Each ``*.json`` entry in PATH is parsed; the image named by its
    ``"filename"`` key is read from the same directory with
    ``cv2.imread(..., 0)`` and cropped to the bounding box of the first page.
    The symbols of every region of type ``"staff"`` on that page are
    collected into one token sequence.

    WARNING: this loader is destructive. When a JSON file yields no symbols,
    both the JSON file and the image it references are deleted from PATH.

    Args:
        PATH: Directory containing the JSON files and the images they name.
        encoding: When equal to ``"sseq"``, each symbol contributes two
            tokens (its ``agnostic_symbol_type`` followed by its
            ``position_in_staff``); for any other value it contributes a
            single ``"<type>:<position>"`` token.

    Returns:
        A tuple ``(X, Y)`` with the cropped images and the matching token
        sequences. Samples with an empty sequence are left out.

    Raises:
        TypeError: Presumably when the referenced image cannot be read, since
            the crop then indexes the ``None`` that ``cv2.imread`` returns.
    """
    X = []
    Y = []
    split_tokens = encoding == "sseq"
    for file in tqdm(os.listdir(f"{PATH}")):
        # Filter on the extension BEFORE opening anything: the directory also
        # holds the images, and an image deleted further down would still be
        # present in this listdir snapshot and make open() fail.
        if file.split(".")[-1] != "json":
            continue

        with open(f"{PATH}/{file}") as jsonfile:
            data = json.load(jsonfile)

        image_path = f"{PATH}/{data['filename']}"
        page = data["pages"][0]
        bbox = page['bounding_box']
        image = cv2.imread(image_path, 0)
        image = image[bbox["fromY"]:bbox["toY"], bbox["fromX"]:bbox["toX"]]

        sequence = []
        for region in page["regions"]:
            # Only staff regions carry symbols; empty staves have no
            # "symbols" key at all.
            if region["type"] != "staff" or "symbols" not in region:
                continue
            for symbol in region["symbols"]:
                if split_tokens:
                    sequence.append(f"{symbol['agnostic_symbol_type']}")
                    sequence.append(f"{symbol['position_in_staff']}")
                else:
                    sequence.append(f"{symbol['agnostic_symbol_type']}:{symbol['position_in_staff']}")

        if sequence:
            X.append(image)
            Y.append(sequence)
        else:
            # Purge samples without any symbol from the dataset on disk.
            os.remove(f"{PATH}/{file}")
            os.remove(image_path)
    return X, Y
def load_data_testcase_im2s(PATH):
    """Load PNG images and ``<sos>``/``<eos>``-delimited token labels.

    Every directory entry whose extension is ``.png`` is read with
    ``cv2.imread(..., 0)`` and paired with the first line of ``<name>.txt``.
    That line is split on single spaces, each token is stripped, and the
    result is wrapped between ``'<sos>'`` and ``'<eos>'`` markers.

    Args:
        PATH: Directory holding the ``.png``/``.txt`` sample pairs. A trailing
            path separator is accepted but not required.

    Returns:
        A tuple ``(X, Y)``: ``X`` is the list of values returned by
        ``cv2.imread`` (stored as-is, without checking that decoding
        succeeded) and ``Y`` the list of delimited token lists, in matching
        order.

    Raises:
        FileNotFoundError: If a ``.png`` entry has no matching ``.txt`` file.
    """
    X = []
    Y = []
    for file in tqdm(os.listdir(PATH)):
        # splitext copes with entries that carry no extension at all
        # instead of raising IndexError, and keeps the full stem for names
        # that contain several dots.
        sample, extension = os.path.splitext(file)
        if extension != ".png":
            continue
        # os.path.join adds the separator only when PATH does not end in one.
        X.append(cv2.imread(os.path.join(PATH, file), 0))
        with open(os.path.join(PATH, f"{sample}.txt"), "r") as agnosticfile:
            # Only the first line of the transcription file is used.
            first_line = agnosticfile.readline()
        tokens = [token.strip() for token in first_line.split(" ")]
        Y.append(['<sos>'] + tokens + ['<eos>'])
    return X, Y
def load_data_jsonMuret_im2s(PATH):
    """Load cropped images and ``<sos>``/``<eos>``-delimited symbol sequences.

    Each ``*.json`` entry in PATH is parsed; the image named by its
    ``"filename"`` key is read from the same directory with
    ``cv2.imread(..., 0)`` and cropped to the bounding box of the first page.
    Every symbol of every ``"staff"`` region on that page becomes one
    ``"<agnostic_symbol_type>:<position_in_staff>"`` token, and the resulting
    sequence is wrapped between ``'<sos>'`` and ``'<eos>'``.

    WARNING: this loader is destructive. When a JSON file yields no symbols,
    both the JSON file and the image it references are deleted from PATH.

    Args:
        PATH: Directory containing the JSON files and the images they name.

    Returns:
        A tuple ``(X, Y)`` with the cropped images and the matching delimited
        token sequences. Samples with no symbols are left out.

    Raises:
        TypeError: Presumably when the referenced image cannot be read, since
            the crop then indexes the ``None`` that ``cv2.imread`` returns.
    """
    X = []
    Y = []
    for file in tqdm(os.listdir(f"{PATH}")):
        # Filter on the extension BEFORE opening anything: the directory also
        # holds the images, and an image deleted further down would still be
        # present in this listdir snapshot and make open() fail.
        if file.split(".")[-1] != "json":
            continue

        with open(f"{PATH}/{file}") as jsonfile:
            data = json.load(jsonfile)

        image_path = f"{PATH}/{data['filename']}"
        page = data["pages"][0]
        bbox = page['bounding_box']
        image = cv2.imread(image_path, 0)
        image = image[bbox["fromY"]:bbox["toY"], bbox["fromX"]:bbox["toX"]]

        sequence = []
        for region in page["regions"]:
            # Only staff regions carry symbols; empty staves have no
            # "symbols" key at all.
            if region["type"] != "staff" or "symbols" not in region:
                continue
            for symbol in region["symbols"]:
                sequence.append(f"{symbol['agnostic_symbol_type']}:{symbol['position_in_staff']}")

        if sequence:
            X.append(image)
            Y.append(['<sos>'] + sequence + ['<eos>'])
        else:
            # Purge samples without any symbol from the dataset on disk.
            os.remove(f"{PATH}/{file}")
            os.remove(image_path)
    return X, Y
def load_data_muret(IMG_PATH, AGNOSTIC_PATH):
    """Load cropped page images together with their parsed JSON annotations.

    AGNOSTIC_PATH is expected to contain one sub-folder per collection, each
    holding JSON files. For every JSON file the image named by its
    ``"filename"`` key is read from ``IMG_PATH/<folder>/masters/`` with
    ``cv2.imread(..., 0)`` and cropped to the bounding box of the first page.

    Args:
        IMG_PATH: Root directory of the images, laid out as
            ``<folder>/masters/<filename>``.
        AGNOSTIC_PATH: Root directory of the JSON annotations, laid out as
            ``<folder>/<file>``.

    Returns:
        A tuple ``(X, Y)`` where ``X`` holds the cropped images and ``Y`` the
        complete parsed JSON object of each file, in matching order.
    """
    X = []
    Y = []
    # The module imports the class itself (``from tqdm import tqdm``), so it
    # is called directly; ``tqdm.tqdm`` would raise AttributeError.
    for folder in tqdm(os.listdir(AGNOSTIC_PATH)):
        for file in os.listdir(f"{AGNOSTIC_PATH}/{folder}"):
            with open(f"{AGNOSTIC_PATH}/{folder}/{file}") as jsonfile:
                data = json.load(jsonfile)
            image = cv2.imread(f"{IMG_PATH}/{folder}/masters/{data['filename']}", 0)
            bbox = data["pages"][0]['bounding_box']
            image = image[bbox["fromY"]:bbox["toY"], bbox["fromX"]:bbox["toX"]]
            X.append(image)
            Y.append(data)
    return X, Y