-
Notifications
You must be signed in to change notification settings - Fork 0
/
TestDatasetGenerator.py
252 lines (221 loc) · 10.9 KB
/
TestDatasetGenerator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
from torch.utils.data import Dataset
import torch
import random
import numpy as np
from matplotlib import pyplot as plt
import math
import pickle
import os
"""
This module provides a class that creates random numpy arrays whose variable-length dimension
follows one of 5 possible length distributions
(Gaussian, Exponential_Right, Exponential_Left, Random, Equal).
The numpy arrays are also saved to files on disk.
This was used to create dummy data for evaluating the data samplers.
"""
class TestDatasetGenerator:
    """
    Creates random numpy arrays whose variable dimension follows a chosen length
    distribution and saves them, together with the drawn length list and a
    histogram of the lengths, below ``root_dir/<distribution>``.

    Constructing an instance runs the whole pipeline: draw the lengths, save the
    length list, plot the histogram and write the numpy files.
    """

    # The class name starts with "Test" but this is not a test case; this flag
    # tells pytest not to try to collect it.
    __test__ = False

    _DISTRIBUTIONS = ("Gaussian", "Exponential_Right", "Exponential_Left", "Random", "Equal")

    def __init__(self, num_items=100, distribution="Gaussian", min_length=2, max_length=60,
                 train_events=1, feature_dim=4, root_dir="./"):
        """
        num_items: number of numpy files to create (per mode and per identifier)
        distribution: the distribution that the length of the drawn samples will follow;
            unknown names fall back to "Random"
        min_length: minimum size of varying dimension
        max_length: maximum size of varying dimension
        train_events: number of arrays per numpy file
        feature_dim: size of the last dimension of the "leaves" arrays
        root_dir: where to save the numpy array files to (created if it does not exist)
        """
        self.num_items = num_items
        self.data = []
        self.labels = []
        if distribution in self._DISTRIBUTIONS:
            self.distribution = distribution
        else:
            # Unknown names have always been mapped to "Random"; say so instead of
            # doing it silently.
            print("The chosen data distribution is invalid. Using Random distribution instead.")
            self.distribution = "Random"
        # checking whether chosen min and max lengths make sense
        if min_length < max_length:
            self.min_length = min_length
            self.max_length = max_length
        else:
            print("Min length of samples must be smaller than max length")
            self.min_length = 2
            self.max_length = 60
        self.train_events = train_events
        self.features_dim = feature_dim
        # Always define root_dir / save_dir and create the target directory, so the
        # save and plot steps below never hit a missing attribute or directory.
        self.root_dir = root_dir
        self.save_dir = os.path.join(self.root_dir, self.distribution)
        os.makedirs(self.save_dir, exist_ok=True)

        # determining the distribution to use
        creators = {
            "Gaussian": self.create_gaussian,
            "Random": self.create_random,
            "Equal": self.create_equal,
            "Exponential_Right": self.create_exponential_right,
            "Exponential_Left": self.create_exponential_left,
        }
        creators[self.distribution]()
        assert len(self.length_list) == self.num_items
        self.save_length_list()
        # To rebuild the files from a previously saved length list instead, call
        # read_length_list(<path to length_list_file>) before the two steps below.
        self.visualize_distribution(verbose=True)
        self.create_tensors()

    def _draw_length(self, draw):
        """
        Calls ``draw()`` until the value lies in [min_length, max_length + 1) and
        returns it truncated to int. The upper bound is max_length + 1 so that
        truncation can still yield max_length itself.
        """
        upper = self.max_length + 1
        while True:
            value = float(draw())
            if self.min_length <= value < upper:
                return int(value)

    def _exponential_scale(self):
        """
        Scale of the exponential distributions. Clamped to at least 1 because the
        raw value is <= 0 for ranges smaller than 10, which numpy rejects (negative)
        or which makes the rejection sampling never terminate (zero).
        """
        return max(1, int((self.max_length - self.min_length) / 2) - 4)

    def create_gaussian(self):
        """
        This function creates a list of lengths, the lengths are drawn from a Gaussian distribution
        centered in the middle of [min_length, max_length].
        Samples are redrawn until their length is within [min_length, max_length].
        """
        center = int((self.max_length - self.min_length) / 2) + self.min_length
        scale = math.sqrt((self.max_length - self.min_length) / 2) + 4
        # from https://stackoverflow.com/questions/16471763/generating-numbers-with-gaussian-function-in-a-range-using-python
        rng = np.random.default_rng()
        length_list = [self._draw_length(lambda: rng.normal(center, scale))
                       for _ in range(self.num_items)]
        print("max is ", max(length_list))
        print("min is ", min(length_list))
        self.length_list = length_list

    def create_exponential_right(self):
        """
        This function creates a list of lengths, the lengths are drawn from an Exponential Right distribution.
        That is a distribution where most lengths are large, and only few are small.
        Samples are redrawn until their length is within [min_length, max_length].
        """
        scale = self._exponential_scale()
        rng = np.random.default_rng()
        drawn = [self._draw_length(lambda: rng.exponential(scale))
                 for _ in range(self.num_items)]
        # Mirror the lengths inside [min_length, max_length] so the mass sits on the right.
        length_list = [self.max_length + self.min_length - value for value in drawn]
        print("max is ", max(length_list))
        print("min is ", min(length_list))
        self.length_list = length_list

    def create_exponential_left(self):
        """
        This function creates a list of lengths, the lengths are drawn from an Exponential Left distribution.
        That is a distribution where most lengths are small, and only few are large.
        Samples are redrawn until their length is within [min_length, max_length].
        """
        scale = self._exponential_scale()
        rng = np.random.default_rng()
        length_list = [self._draw_length(lambda: rng.exponential(scale))
                       for _ in range(self.num_items)]
        print("max is ", max(length_list))
        print("min is", min(length_list))
        self.length_list = length_list

    def create_equal(self):
        """
        This function creates a list of lengths following an Equal distribution.
        The lengths cycle through min_length..max_length, so every length occurs
        (almost) equally often. Nothing is drawn randomly here.
        """
        span = self.max_length + 1 - self.min_length
        length_list = [(i % span) + self.min_length for i in range(self.num_items)]
        print(" max is ", max(length_list))
        print(" min is ", min(length_list))
        self.length_list = length_list

    def create_random(self):
        """
        This function creates a list of lengths, each drawn uniformly at random
        from the integers in [min_length, max_length].
        """
        length_list = [random.randint(self.min_length, self.max_length)
                       for _ in range(self.num_items)]
        print("max is ", max(length_list))
        print("min is ", min(length_list))
        self.length_list = length_list

    def visualize_distribution(self, verbose=True):
        """
        This function visualizes the distribution of the drawn lengths as a histogram
        and saves it as "Distribution_visualization.jpg" in the save directory.
        verbose: if True, the plot is also shown on screen
        """
        # A dedicated figure keeps histograms of different generator instances from
        # being drawn on top of each other.
        fig, ax = plt.subplots()
        # Leave a small margin right of the longest possible sample (65 for the default max_length of 60).
        ax.set_xlim([0, self.max_length + 5])
        ax.hist(self.length_list, bins=max(1, len(set(self.length_list))), alpha=0.5)
        ax.set_title("Distribution: " + self.distribution)
        ax.set_xlabel("Length of sample")
        ax.set_ylabel("count")
        fig.savefig(os.path.join(self.save_dir, "Distribution_visualization.jpg"))
        if verbose:
            plt.show()
        plt.close(fig)

    def create_tensors(self):
        """
        This function creates the random numpy arrays, according to the length list.
        For every mode ("train", "val") and every length L in the list it saves one
        "lcas" array of shape (train_events, L, L) and one "leaves" array of shape
        (train_events, L, features_dim); the file number is the index in the length list.
        """
        for mode in ("train", "val"):
            for counter, length in enumerate(self.length_list):
                lca_matrix = np.random.rand(self.train_events, length, length)
                leaves = np.random.rand(self.train_events, length, self.features_dim)
                self._save_to_drive([lca_matrix], "lcas", mode, counter)
                self._save_to_drive([leaves], "leaves", mode, counter)

    def _save_to_drive(self, tensor_list, identifier, mode, counter, path="./"):
        """
        This function saves a list of tensors to the disk.
        tensor_list: List of tensors to save to disk
        identifier: leaves or lcas
        mode: train or validation
        counter: number identifying the numpy file
        path: fallback directory, used only if root_dir is not an existing directory
        """
        save_path = self.save_dir if os.path.isdir(self.root_dir) else path
        # NOTE: every tensor of the list is written to the same file name, so with
        # more than one entry only the last one remains on disk.
        file_path = os.path.join(save_path, f"{identifier}_{mode}.{counter}.npy")
        for tensor in tensor_list:
            np.save(file_path, tensor)

    def save_length_list(self, path="./"):
        """
        This function saves the drawn length list to a file so that it is possible
        to create the numpy arrays or the distribution visualization again just using
        this file. The location of the file is remembered in self.length_list_path.
        path: fallback directory, used only if root_dir is not an existing directory
        """
        save_path = self.save_dir if os.path.isdir(self.root_dir) else path
        save_path = os.path.join(save_path, "length_list_file")
        with open(save_path, "wb") as fp:
            pickle.dump(self.length_list, fp)
        self.length_list_path = save_path

    def read_length_list(self, path="./"):
        """
        This function loads a length list written by save_length_list into self.length_list.
        path: path of the pickled length list file
        """
        # SECURITY: pickle can execute arbitrary code while loading; only read
        # length list files that come from a trusted source.
        with open(path, "rb") as fp:
            self.length_list = pickle.load(fp)
def main():
    """
    Example usage for saving numpy files: runs the generator once for each of the
    five length distributions, writing into sub-directories of root.
    """
    root = "/directory/to/save/numpy/files/in"
    distribution_names = ("Gaussian", "Exponential_Right", "Exponential_Left", "Random", "Equal")
    for distribution_name in distribution_names:
        # Constructing the generator already draws the lengths and writes all files.
        TestDatasetGenerator(
            num_items=10000,
            root_dir=root,
            distribution=distribution_name,
            min_length=2,
            max_length=60,
        )
    return 0


if __name__ == "__main__":
    main()