#https://machinelearningmastery.com/how-to-develop-lstm-models-for-multi-step-time-series-forecasting-of-household-power-consumption/
#https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html
#https://levelup.gitconnected.com/building-seq2seq-lstm-with-luong-attention-in-keras-for-time-series-forecasting-1ee00958decb
import os, sys, shutil
from datetime import datetime
from itertools import chain
import numpy as np
from numpy import array  # numpy's repr writes arrays as array(...); keeping this import makes such saved files eval()-able
import pandas as pd
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # silence TF info/warning logs; must be set before importing tensorflow
import tensorflow as tf
from tensorflow.keras.layers import LSTM, ConvLSTM2D, Dense, BatchNormalization, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import plot_model
from tensorboard.plugins.hparams import api as hp
from models.models import *
from generator import DataGenerator, MergedGenerators
from lstm_utils import *  # this is ours: project-local helpers
# This is not necessary on Colab: enable memory growth on every GPU instead of
# assuming the GPU is the second visible device
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)
# Load the experiment configuration
args = get_args()
config = load_config(args.experiment)
print(config)
np.random.seed(1)  # fixed master seed; config.random_seed could be used here instead
randomSeeds = (np.random.random(config.best_of) * 100).astype(int)  # one seed per training run
# Models that use space weather take different datasets, so the config may list several .npy files
print(f"Loading data from {config.train_npy_dataset}")
ionex_npy = config.train_npy_dataset.split(',')
ionexList = []
for npy_file in ionex_npy:
    ionex = np.load(npy_file)
    if config.train_time_sampling > 1:
        ionex = ionex[::config.train_time_sampling, ...]  # e.g. keep every 2nd frame for a 2h step
    # Resize to 72x72: drop the last longitude column and repeat the last latitude row
    ionex = np.concatenate((ionex[:, :, :-1, :], ionex[:, -1:, :-1, :]), axis=1)
    ionex = getDataSubset(ionex, config.experiment_name)
    ionexList.append(ionex)
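# Shape trace for the resize above (assuming the usual 71x73 IONEX grid):
#   (T, 71, 73, C) --drop last longitude column--> (T, 71, 72, C)
#   append the last latitude row again         --> (T, 72, 72, C)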
# Scaling: collect per-channel statistics over the full stacked dataset
ionexStack = np.concatenate(ionexList)
parameters = {"mean": ionexStack.mean(axis=(0, 1, 2)), "max": ionexStack.max(axis=(0, 1, 2)), "min": ionexStack.min(axis=(0, 1, 2)), "input_t_steps": config.lag_window}
ionexList = [scaleForward(ionex, parameters) for ionex in ionexList]
del ionexStack
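# Scaling sketch (an assumption about scaleForward in lstm_utils): it is
# expected to apply a per-channel linear normalization built from the
# statistics above, roughly
#   scaled = (x - mean) / (max - min)
# with the matching inverse applied when predictions are mapped back to the
# original units.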
# These scaling parameters must be saved for prediction time
print("Saving scaling information to params.py. If you change the input data, please remove the file and retrain.")
with open(getModelFilePath(config.experiment_name, "params.py"), 'w') as f:
    f.write(repr(parameters))
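# Note: repr(parameters) writes the numpy arrays as `array([...])` literals, so
# any consumer of params.py needs `array` in scope when reading it back, e.g.:
#   from numpy import array
#   with open(paramsPath) as f:          # paramsPath: hypothetical variable
#       parameters = eval(f.read())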
exp_val_rmse = []
for experimentNumber, randomSeed in enumerate(randomSeeds):  # train the network config.best_of times
    print(f"Starting experiment {experimentNumber}")
    try:
        model = eval(config.model)  # resolve the model constructor named in the config
    except Exception:
        print(f"Error trying to load the model chosen in {args.experiment}")
        sys.exit()
    ## HYPER PARAMETERS ##
    batch_size = config.batch_size
    input_t_steps = config.lag_window
    # seq2one predicts a single step ahead; seq2seq predicts the whole window
    if config.prediction == 'seq2one':
        output_t_steps = 1
    else:
        output_t_steps = config.prediction_window
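    # Shape note (an assumption to be checked against the batch_shape_y print
    # below): with seq2one a target sample should cover one time step,
    # e.g. (1, H, W, C), while seq2seq targets span the full prediction
    # window, e.g. (prediction_window, H, W, C).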
print("Input shape: ",ionex.shape)
    # The data generators apply a sliding window over the time frames
    training_generators = []
    validation_generators = []
    for ionex in ionexList:
        training_generators.append(DataGenerator(ionex, batch_size=batch_size, nstepsin=config.lag_window, nstepsout=output_t_steps, sample_rate=config.resample_rate, validation=False, val_split=0.2, random_state=23))
        validation_generators.append(DataGenerator(ionex, batch_size=batch_size, nstepsin=config.lag_window, nstepsout=output_t_steps, sample_rate=config.resample_rate, validation=True, val_split=0.2, random_state=23))
    training_generator = MergedGenerators(training_generators)
    validation_generator = MergedGenerators(validation_generators)
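    # Sliding-window sketch (an assumption about DataGenerator's indexing,
    # based on its nstepsin/nstepsout arguments): for a window starting at
    # frame i, the input/target pair is roughly
    #   X = maps[i : i + nstepsin]                          # lag_window past frames
    #   y = maps[i + nstepsin : i + nstepsin + nstepsout]   # frames to predict
    # The shared val_split=0.2 and random_state=23 should make the train and
    # validation window sets disjoint and reproducible across runs.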
#print(f"Checking intersections: {list(set(validation_generator.list_IDs) & set(training_generator.list_IDs))}")
print(f"Training maps: {training_generator[0][0].shape}")
print(f"Validation maps: {validation_generator[0][0].shape}")
print(f"Training set: {training_generator.count()}")
print(f"Validation set: {validation_generator.count()}")
batch_shape_x=training_generator[0][0][0].shape
batch_shape_y=training_generator[0][1][0].shape
print(f"batch_shape_x={batch_shape_x}")
print(f"batch_shape_y={batch_shape_y}")
model=model(batch_shape_x,nstepsout=output_t_steps, filters=config.filters) #initializing model
print(model.summary())
plot_model(model, to_file=getModelFilePath(config.experiment_name, "model.png"), show_shapes=True, show_layer_names=True)
model.compile(optimizer=config.optimizer, jit_compile=True, loss=config.loss,metrics=['mean_absolute_error', 'mean_squared_error'])
print("Model fitting")
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0001,patience=10, restore_best_weights=True, verbose = 1, mode="min") #0.0001 / 10
#checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(getModelFileName(config.experiment_name), monitor='val_loss', verbose=1, save_best_only=True)
logdir='./logs/'+config.experiment_name
tb_callback = tf.keras.callbacks.TensorBoard(logdir, update_freq='epoch')
from tensorboard.plugins.hparams import api as hp
#model.adapt(training_generator)
start = datetime.now()
print(f'Started training: {start}')
history=model.fit(training_generator,validation_data=validation_generator, epochs=config.num_epochs, verbose=2, callbacks = [earlystopping,tb_callback])
end = datetime.now()
print(f'Ended training: {end}')
totaltime= end - start
currentModelFname=getModelFileName(config.experiment_name,experimentNumber)
model.save(currentModelFname)
    val_results = model.evaluate(validation_generator, batch_size=config.batch_size, verbose=2)
    train_results = model.evaluate(training_generator, batch_size=config.batch_size, verbose=2)
    print(val_results)
    print(f"Time spent training: {totaltime}")
    memoryUsed = getNvidiaSmiMem()
    print(f"Memory used: {memoryUsed}")
    s = getScaleFromParameters(parameters)[0]  # linear factor mapping scaled errors back to the original units
    val_rmse = s * np.sqrt(val_results[2])  # results order: [loss, mae, mse]
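    # Why multiply by s: training runs on linearly scaled data, so (assuming a
    # scaling of the form x_scaled = (x - shift) / s, which is what
    # getScaleFromParameters suggests) the errors transform as
    #   MAE_original = s * MAE_scaled
    #   MSE_original = s**2 * MSE_scaled  =>  RMSE_original = s * sqrt(MSE_scaled)
    # which is exactly what the *_rmse and *_mae computations here apply.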
    if experimentNumber == 0:
        best_val_rmse = val_rmse
        bestExp = 0
    if val_rmse <= best_val_rmse:
        print("Saving training results because this is currently the best model.")
        best_val_rmse = val_rmse
        bestExp = experimentNumber
    # Save macro results: results.py holds a repr'd dict keyed by "<experiment>_<n>"
    resultsFile = getModelFilePath(config.experiment_name, "results.py")
    if not os.path.exists(resultsFile):
        results = {}
    else:
        with open(resultsFile, 'r') as f:
            results = eval(f.read())
    expId = f"{config.experiment_name}_{experimentNumber}"
    if expId not in results:
        modelResults = {}
    else:
        modelResults = results[expId]
    modelResults["time"] = totaltime.total_seconds()
    modelResults["memory"] = memoryUsed
    modelResults["epochs"] = len(history.history['loss'])
    modelResults["train_rmse"] = s * np.sqrt(train_results[2])
    modelResults["train_mae"] = s * train_results[1]
    modelResults["val_rmse"] = val_rmse
    modelResults["val_mae"] = s * val_results[1]
    results[expId] = modelResults
    with open(resultsFile, 'w') as f:
        f.write(repr(results))
    np.savez(getModelFilePath(config.experiment_name, f'history{experimentNumber}'), h=history.history)
    # read back with: np.load(getModelFilePath(config.experiment_name, f'history{experimentNumber}.npz'), allow_pickle=True)['h'].item()
    scale = getScaleFromParameters(parameters)
    plotHistory(history, getModelFilePath(config.experiment_name, f'history{experimentNumber}.png'), scale=scale)
# After all runs, restore the best experiment as the canonical model file
print(f"Best experiment: {bestExp}. Restoring best model.")
shutil.copy(getModelFileName(config.experiment_name, bestExp), getModelFileName(config.experiment_name))
with tf.summary.create_file_writer(logdir, name=config.experiment_name).as_default():
    hparams = {
        'model': config.model,
        'parameters': sum(v.shape.num_elements() for v in model.trainable_variables),
        'batch_size': batch_size,
        'nstepsin': config.lag_window,
        'nstepsout': output_t_steps,
        'time_training': totaltime.total_seconds(),
    }
    hp.hparams(hparams)  # record the values used in this trial
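# The training curves and hyperparameters logged above can be inspected with
# the standard TensorBoard CLI:
#   tensorboard --logdir ./logs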