# -*- coding: utf-8 -*-
"""
Created on Sat Apr 10 23:41:52 2021
@author: changai
"""
from proj1_1_load_data import *  # provides the raw data matrix X and attributeNames1
from matplotlib.pylab import (figure, semilogx, loglog, xlabel, ylabel,
                              legend, title, subplot, show, grid)
import numpy as np
from scipy.io import loadmat  # not used in this script
import sklearn.linear_model as lm
from sklearn import model_selection
from toolbox_02450 import rlr_validate
y = X[:,9].astype('float')          # target variable: column 9 of the loaded data
y = y.squeeze()
X = X[:,range(0,9)].astype(float)   # keep only the nine meteorological attributes as features
N,M = X.shape

# Center each column (note: the data is standardized again per fold below, so this
# global standardization mainly gives a convenient common scale)
X = X - np.ones((N,1)) * X.mean(axis=0)
#print(X.mean(axis=0))  # column means after centering, an array of shape (M,)

# Scale each column to unit standard deviation
X = X*(1/np.std(X,axis=0))

# Add offset (bias) attribute as the first column
X = np.concatenate((np.ones((X.shape[0],1)),X),1).astype('float')
print(X.shape)
attributeNames = [u'Offset']+attributeNames1
M = M+1
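
# A small optional sanity check (not in the original script): after the global
# standardization, the non-offset columns should have mean ~0 and unit std.
#assert np.allclose(X[:, 1:].mean(axis=0), 0, atol=1e-8)
#assert np.allclose(X[:, 1:].std(axis=0), 1)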
## Crossvalidation
# Create crossvalidation partition for evaluation
K = 10
CV = model_selection.KFold(K, shuffle=True)
#CV = model_selection.KFold(K, shuffle=False)
# Candidate values of lambda: 10^-5 to 10^8
lambdas = np.power(10.,range(-5,9))
# Initialize variables
#T = len(lambdas)
Error_train = np.empty((K,1))
Error_test = np.empty((K,1))
Error_train_rlr = np.empty((K,1))
Error_test_rlr = np.empty((K,1))
Error_train_nofeatures = np.empty((K,1))
Error_test_nofeatures = np.empty((K,1))
w_rlr = np.empty((M,K))
mu = np.empty((K, M-1))
sigma = np.empty((K, M-1))
w_noreg = np.empty((M,K))
k=0
opt_lambdas = []   # optimal lambda selected in each outer fold
for train_index, test_index in CV.split(X,y):
    # Extract training and test set for the current CV fold
    X_train = X[train_index]
    y_train = y[train_index]
    X_test = X[test_index]
    y_test = y[test_index]
    internal_cross_validation = 10

    # Inner cross-validation selects the optimal lambda for this outer fold
    opt_val_err, opt_lambda, mean_w_vs_lambda, train_err_vs_lambda, test_err_vs_lambda = rlr_validate(X_train, y_train, lambdas, internal_cross_validation)
    opt_lambdas.append(opt_lambda)

    # Standardize the outer fold based on the training set, and save the mean and
    # standard deviations since they are part of the model (they would be needed
    # for making new predictions) - for brevity we won't always store these in the scripts
    mu[k, :] = np.mean(X_train[:, 1:], 0)
    sigma[k, :] = np.std(X_train[:, 1:], 0)
    X_train[:, 1:] = (X_train[:, 1:] - mu[k, :]) / sigma[k, :]
    X_test[:, 1:] = (X_test[:, 1:] - mu[k, :]) / sigma[k, :]

    Xty = X_train.T @ y_train
    XtX = X_train.T @ X_train

    # Compute mean squared error without using the input data at all (baseline: predict the mean)
    Error_train_nofeatures[k] = np.square(y_train-y_train.mean()).sum(axis=0)/y_train.shape[0]
    Error_test_nofeatures[k] = np.square(y_test-y_test.mean()).sum(axis=0)/y_test.shape[0]

    # Estimate weights for the optimal value of lambda, on the entire training set
    lambdaI = opt_lambda * np.eye(M)
    lambdaI[0,0] = 0  # do not regularize the bias term
    w_rlr[:,k] = np.linalg.solve(XtX+lambdaI,Xty).squeeze()

    # Compute mean squared error with regularization with the optimal lambda
    Error_train_rlr[k] = np.square(y_train-X_train @ w_rlr[:,k]).sum(axis=0)/y_train.shape[0]
    Error_test_rlr[k] = np.square(y_test-X_test @ w_rlr[:,k]).sum(axis=0)/y_test.shape[0]

    # Estimate weights for unregularized linear regression, on the entire training set
    w_noreg[:,k] = np.linalg.solve(XtX,Xty).squeeze()

    # Compute mean squared error without regularization
    Error_train[k] = np.square(y_train-X_train @ w_noreg[:,k]).sum(axis=0)/y_train.shape[0]
    Error_test[k] = np.square(y_test-X_test @ w_noreg[:,k]).sum(axis=0)/y_test.shape[0]

    # OR ALTERNATIVELY: you can use the sklearn.linear_model module for linear regression:
    #m = lm.LinearRegression().fit(X_train, y_train)
    #Error_train[k] = np.square(y_train-m.predict(X_train)).sum()/y_train.shape[0]
    #Error_test[k] = np.square(y_test-m.predict(X_test)).sum()/y_test.shape[0]
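
    # Hedged cross-check (not in the original script): the closed-form solution
    # above should roughly match sklearn's Ridge with alpha=opt_lambda and
    # fit_intercept=False (the offset column is already in X_train), except that
    # Ridge also penalizes the offset weight while lambdaI[0,0]=0 exempts it here.
    #ridge = lm.Ridge(alpha=opt_lambda, fit_intercept=False).fit(X_train, y_train)
    #print('max |w_rlr - ridge.coef_| = {}'.format(np.abs(w_rlr[:,k] - ridge.coef_).max()))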
    # Display the results for the last cross-validation fold
    if k == K-1:
        figure(k, figsize=(12,8))
        subplot(1,2,1)
        #print(mean_w_vs_lambda.shape)
        semilogx(lambdas,mean_w_vs_lambda.T[:,1:],'.-')  # don't plot the bias term
        xlabel('Regularization factor')
        ylabel('Mean Coefficient Values')
        grid()
        # You can choose to display the legend, but it is omitted here for a
        # cleaner plot, since there are many attributes
        #legend(attributeNames[1:], loc='best')

        subplot(1,2,2)
        title('Optimal lambda: 1e{0}'.format(np.log10(opt_lambda)))
        loglog(lambdas,train_err_vs_lambda.T,'b.-',lambdas,test_err_vs_lambda.T,'r.-')
        xlabel('Regularization factor')
        ylabel('Squared error (crossvalidation)')
        legend(['Train error','Validation error'])
        grid()

    # To inspect the used indices, use these print statements
    #print('Cross validation fold {0}/{1}:'.format(k+1,K))
    #print('Train indices: {0}'.format(train_index))
    #print('Test indices: {0}\n'.format(test_index))

    k+=1
show()
# Display results
print('Linear regression without feature selection:')
print('- Training error: {0}'.format(Error_train.mean()))
print('- Test error: {0}'.format(Error_test.mean()))
print('- R^2 train: {0}'.format((Error_train_nofeatures.sum()-Error_train.sum())/Error_train_nofeatures.sum()))
print('- R^2 test: {0}\n'.format((Error_test_nofeatures.sum()-Error_test.sum())/Error_test_nofeatures.sum()))
print('Regularized linear regression:')
print('- Training error: {0}'.format(Error_train_rlr.mean()))
print('- Test error: {0}'.format(Error_test_rlr.mean()))
print('- R^2 train: {0}'.format((Error_train_nofeatures.sum()-Error_train_rlr.sum())/Error_train_nofeatures.sum()))
print('- R^2 test: {0}\n'.format((Error_test_nofeatures.sum()-Error_test_rlr.sum())/Error_test_nofeatures.sum()))
print('Weights in last fold:')
for m in range(M):
    print('{:>15} {:>15}'.format(attributeNames[m], np.round(w_rlr[m,-1],2)))
print('\n +++++++ baseline output ++++++++')
print('test errors: {}'.format(Error_test_nofeatures))
print('\n +++++++ linear regression output ++++++++')
print('Optimal lambdas per fold: {}'.format(opt_lambdas))
print('test errors: {}'.format(Error_test_rlr))
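
# A minimal sketch (not part of the original script) of how a new observation could
# be scored with the model from the last fold: standardize it with the saved mu/sigma,
# prepend the offset term, and apply the regularized weights. `x_new` is a
# hypothetical raw feature vector of length M-1.
def predict_last_fold(x_new):
    x_std = (np.asarray(x_new, dtype=float) - mu[-1, :]) / sigma[-1, :]
    x_aug = np.concatenate(([1.0], x_std))   # prepend the offset attribute
    return x_aug @ w_rlr[:, -1]

# Assumed usage: a raw vector equal to the last fold's mean standardizes to zeros,
# so this should return the offset weight w_rlr[0,-1].
#print(predict_last_fold(mu[-1, :]))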