-
Notifications
You must be signed in to change notification settings - Fork 1
/
EM_data_preprocessing.py
85 lines (66 loc) · 3.06 KB
/
EM_data_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def data_preprocessing_dataset1(data_loc, filename, results_loc, *, random_state=None):
    """Load dataset 1, attach ids and block-wise labels, shuffle it, and plot it.

    The file ``data_loc/filename`` is read as a header-less,
    whitespace-separated table with the two columns ``X1`` and ``X2``.
    A 1-based ``obs_i`` column and a ``label`` column are added. Labels are
    assigned by position: the first ``n // K`` rows get label 1 and the next
    ``n // K`` rows get label 2 (``K = 2``). A scatter plot of the features is
    written to ``results_loc/<filename>_raw_data_scatter.pdf``.

    Note: the columns are combined with an inner join on the row index, so
    when the number of rows is not a multiple of ``K`` the trailing rows that
    received no label are dropped from the result.

    Args:
        data_loc: Directory containing the data file.
        filename: Name of the data file inside ``data_loc``.
        results_loc: Directory the scatter plot is saved to.
        random_state: Optional seed / random state forwarded to
            ``DataFrame.sample`` to make the shuffle reproducible. ``None``
            (the default) keeps the shuffle non-deterministic.

    Returns:
        A tuple ``(data, FEATURES, n_features)``: the shuffled data frame with
        columns ``obs_i``, ``X1``, ``X2`` and ``label``; the list of feature
        column names; and the number of features.
    """
    FEATURES = ['X1', 'X2']
    K = 2  # number of equally sized, contiguous label blocks in the file

    data_path = os.path.join(data_loc, filename)
    # Raw string: '\s' is not a valid Python string escape and triggers a
    # warning on current interpreters when written as a plain literal.
    data = pd.read_csv(data_path, header=None, sep=r'\s+', names=FEATURES)
    n_observations = len(data)

    # Labels 1..K, each repeated n // K times, in file order.
    block_labels = np.repeat(np.arange(1, K + 1), n_observations // K)
    label = pd.DataFrame(block_labels, columns=['label'], dtype=int)
    obs_nr = pd.DataFrame(np.arange(1, n_observations + 1), columns=['obs_i'], dtype=int)

    data = pd.concat([obs_nr, data, label], axis=1, join='inner')
    data = data.sample(frac=1, random_state=random_state)  # shuffle rows

    plot_file_name = filename + '_raw_data_scatter.pdf'
    # Draw on a dedicated figure so a figure the caller may have open is
    # neither drawn on nor closed, and always release it, even if saving fails.
    fig, ax = plt.subplots()
    try:
        ax.scatter(data['X1'], data['X2'], s=1)
        fig.savefig(os.path.join(results_loc, plot_file_name))
    finally:
        plt.close(fig)

    return data, FEATURES, len(FEATURES)
def data_preprocessing_dataset2(data_loc, filename, results_loc, *, random_state=None):
    """Load dataset 2, attach ids and block-wise labels, shuffle it, and plot it.

    The file ``data_loc/filename`` is read as a header-less,
    whitespace-separated table with the two columns ``X1`` and ``X2``.
    A 1-based ``obs_i`` column and a ``label`` column are added. Labels are
    assigned by position: with ``K = 3``, each consecutive run of ``n // K``
    rows gets the next label from 1 to 3. A scatter plot of the features is
    written to ``results_loc/<filename>_raw_data_scatter.pdf``.

    Note: the columns are combined with an inner join on the row index, so
    when the number of rows is not a multiple of ``K`` the trailing rows that
    received no label are dropped from the result.

    Args:
        data_loc: Directory containing the data file.
        filename: Name of the data file inside ``data_loc``.
        results_loc: Directory the scatter plot is saved to.
        random_state: Optional seed / random state forwarded to
            ``DataFrame.sample`` to make the shuffle reproducible. ``None``
            (the default) keeps the shuffle non-deterministic.

    Returns:
        A tuple ``(data, FEATURES, n_features)``: the shuffled data frame with
        columns ``obs_i``, ``X1``, ``X2`` and ``label``; the list of feature
        column names; and the number of features.
    """
    FEATURES = ['X1', 'X2']
    K = 3  # number of equally sized, contiguous label blocks in the file

    data_path = os.path.join(data_loc, filename)
    # Raw string: '\s' is not a valid Python string escape and triggers a
    # warning on current interpreters when written as a plain literal.
    data = pd.read_csv(data_path, header=None, sep=r'\s+', names=FEATURES)
    n_observations = len(data)

    # Labels 1..K, each repeated n // K times, in file order.
    block_labels = np.repeat(np.arange(1, K + 1), n_observations // K)
    label = pd.DataFrame(block_labels, columns=['label'], dtype=int)
    obs_nr = pd.DataFrame(np.arange(1, n_observations + 1), columns=['obs_i'], dtype=int)

    data = pd.concat([obs_nr, data, label], axis=1, join='inner')
    data = data.sample(frac=1, random_state=random_state)  # shuffle rows

    plot_file_name = filename + '_raw_data_scatter.pdf'
    # Draw on a dedicated figure so a figure the caller may have open is
    # neither drawn on nor closed, and always release it, even if saving fails.
    fig, ax = plt.subplots()
    try:
        ax.scatter(data['X1'], data['X2'], s=1)
        fig.savefig(os.path.join(results_loc, plot_file_name))
    finally:
        plt.close(fig)

    return data, FEATURES, len(FEATURES)
def data_preprocessing_dataset3(data_loc, filename, label_file, results_loc, *, random_state=None):
    """Load dataset 3 and its label file, attach ids, shuffle it, and plot it.

    The file ``data_loc/filename`` is read as a header-less,
    whitespace-separated table with the two columns ``X1`` and ``X2``; the
    file ``data_loc/label_file`` is read the same way into a single ``label``
    column. A 1-based ``obs_i`` column is added. A scatter plot of the
    features is written to ``results_loc/<filename>_raw_data_scatter.pdf``.

    Note: features and labels are matched by row position and combined with
    an inner join, so if the two files differ in length only the rows present
    in both are kept.

    Args:
        data_loc: Directory containing the data and label files.
        filename: Name of the feature file inside ``data_loc``.
        label_file: Name of the label file inside ``data_loc``.
        results_loc: Directory the scatter plot is saved to.
        random_state: Optional seed / random state forwarded to
            ``DataFrame.sample`` to make the shuffle reproducible. ``None``
            (the default) keeps the shuffle non-deterministic.

    Returns:
        A tuple ``(data, FEATURES, n_features)``: the shuffled data frame with
        columns ``obs_i``, ``X1``, ``X2`` and ``label``; the list of feature
        column names; and the number of features.
    """
    FEATURES = ['X1', 'X2']

    # Raw strings: '\s' is not a valid Python string escape and triggers a
    # warning on current interpreters when written as a plain literal.
    data_path = os.path.join(data_loc, filename)
    data = pd.read_csv(data_path, header=None, sep=r'\s+', names=FEATURES)
    n_observations = len(data)

    label_path = os.path.join(data_loc, label_file)
    label = pd.read_csv(label_path, header=None, sep=r'\s+', names=['label'])

    obs_nr = pd.DataFrame(np.arange(1, n_observations + 1), columns=['obs_i'], dtype=int)

    data = pd.concat([obs_nr, data, label], axis=1, join='inner')
    data = data.sample(frac=1, random_state=random_state)  # shuffle rows

    plot_file_name = filename + '_raw_data_scatter.pdf'
    # Draw on a dedicated figure so a figure the caller may have open is
    # neither drawn on nor closed, and always release it, even if saving fails.
    fig, ax = plt.subplots()
    try:
        ax.scatter(data['X1'], data['X2'], s=1)
        fig.savefig(os.path.join(results_loc, plot_file_name))
    finally:
        plt.close(fig)

    return data, FEATURES, len(FEATURES)