-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathexperiment5.py
129 lines (117 loc) · 4.18 KB
/
experiment5.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""
Performance experiment.
The code in all experiment files is very ugly, but I did not see the use in creating beautiful
code for all experiments since it is a 'press enter and rerun them' setup.
"""
import time
from schema_matching import *
from schema_matching.misc_func import *
from sklearn.metrics import accuracy_score
from pandas_ml import ConfusionMatrix
from sklearn.metrics import confusion_matrix
import copy
from os.path import isfile
import os
import math
import sys
import numpy as np
def get_column(test_folder='data_test/'):
    """
    Return the first column parsed from the data in ``test_folder``.

    Every regular file in ``test_folder`` is visited in sorted filename order and
    parsed with ``Schema_Reader.get_duplicate_columns(path, True)``. The first
    column of the last file parsed is returned.

    NOTE(review): the original indentation of this file was lost, so it is assumed
    that the ``return`` sits after the loop (last file wins). With a single file in
    the folder this is indistinguishable from returning inside the loop - confirm.

    :param test_folder: directory containing the test data file(s); a trailing
        path separator is no longer required.
    :return: ``columns[0]`` as produced by ``get_duplicate_columns``.
    :raises FileNotFoundError: if ``test_folder`` contains no regular file
        (previously this surfaced as an UnboundLocalError on ``columns``).
    """
    sr = Schema_Reader()
    columns = None
    for filename in sorted(os.listdir(test_folder)):
        print(filename)
        # os.path.join works with and without a trailing separator on test_folder.
        path = os.path.join(test_folder, filename)
        if isfile(path):
            _headers, columns = sr.get_duplicate_columns(path, True)
    if columns is None:
        raise FileNotFoundError("No data file found in " + test_folder)
    return columns[0]
def experiment5():
    """
    Measure how training and classification time grow with the number of matchers.

    For every matcher type in ``matcher_classes`` a Column_Classification_Config is
    grown one feature/matcher pair at a time, up to ``num_classifiers`` pairs. Every
    matcher after the first is registered with the extra argument
    ``(previous_matcher_name, '1')`` (presumably chaining it behind the previous
    matcher's outcome '1' - confirm against Column_Classification_Config).

    After each addition the cumulative time spent in ``add_feature``/``add_matcher``
    is recorded and a deep copy of the config is kept. Each snapshot is then used to
    classify the test column ``classify_repetitions`` times, and the mean duration of
    ``classify_column`` is recorded. Both series are printed, stored and plotted
    through Graph_Maker.

    Timing uses ``time.perf_counter()``; ``time.clock()`` was removed in Python 3.8.
    If a classification does not return "1" the run is aborted with exit status 1,
    so a failed sanity check is not reported as success.
    """
    data_folder = 'data_train/'
    class1 = ['city', 'country', 'date', 'gender', 'house_number',
              'legal_type', 'postcode', 'province', 'sbi_code', 'sbi_description', 'telephone_nr']
    class2 = ['sbi_description']
    data_map = {
        "1": class1,
        "2": class2
    }
    gm_train_time = Graph_Maker()
    gm_classify_time = Graph_Maker()
    num_classifiers = 5
    classify_repetitions = 100
    matcher_classes = ["Fingerprint_Matcher", "Syntax_Matcher", "Word2Vec_Matcher"]
    feature_classes = ["Fingerprint", "Syntax_Feature_Model", "Corpus"]
    dict_args = ["fingerprint", "syntax", "corpus"]
    feature_names = ['feature1', 'feature2', 'feature3', 'feature4', 'feature5']
    matcher_names = ['matcher1', 'matcher2', 'matcher3', 'matcher4', 'matcher5']
    sf_main = Storage_Files(data_folder, data_map)
    # Per matcher type: how many columns / examples per column the feature is built from.
    number_of_columns = [100, 1, 100]
    number_of_examples = [100, 10000, 100]
    col = get_column()

    setups = zip(matcher_classes, feature_classes, dict_args, number_of_columns, number_of_examples)
    for matcher_type, feature_class, dict_arg, num_cols, num_ex in setups:
        print(matcher_type)
        ccc = Column_Classification_Config()
        cccs = []
        train_times = []
        total_time = 0

        # Grow the config one matcher at a time; keep a snapshot after every step
        # so the classification speed of each size can be measured afterwards.
        for n in range(num_classifiers):
            feature_name = feature_names[n]
            matcher_name = matcher_names[n]
            matcher_args = [matcher_name, matcher_type, {feature_name: dict_arg}]
            if n > 0:
                # Every matcher but the first also receives (previous matcher, '1').
                matcher_args.append((matcher_names[n - 1], '1'))
            start = time.perf_counter()
            ccc.add_feature(feature_name, feature_class, [sf_main, num_cols, num_ex, False, True])
            ccc.add_matcher(*matcher_args)
            total_time += time.perf_counter() - start
            train_times.append(total_time)
            cccs.append(copy.deepcopy(ccc))

        classification_times = []
        for config in cccs:
            durations = []
            for _ in range(classify_repetitions):
                # Building the Schema_Matcher is deliberately kept outside the timed region.
                sm = Schema_Matcher(config)
                start = time.perf_counter()
                outcome = sm.classify_column([col])
                # Stop the clock before validating so only classify_column is measured.
                elapsed = time.perf_counter() - start
                if outcome != "1":
                    print("Not all matchers have been passed")
                    sys.exit(1)
                durations.append(elapsed)
            classification_times.append(np.mean(durations))

        gm_classify_time.append_y(classification_times)
        gm_train_time.append_y(train_times)

    gm_train_time.add_x(list(range(1, num_classifiers + 1)))
    gm_classify_time.add_x(list(range(1, num_classifiers + 1)))
    print(gm_train_time)
    print(gm_classify_time)
    gm_train_time.store("/graph_maker/exp1.5_train")
    gm_classify_time.store("/graph_maker/exp1.5_class")

    xlabel = "Number of Matchers"
    ylabel = "Time in Seconds"
    # NOTE(review): the subtitle only reports the first matcher's settings, although
    # number_of_columns / number_of_examples differ for the second matcher type.
    subtitle = "Number of Colums: " + str(number_of_columns[0]) + "\nNumber of Examples per Column: " + str(number_of_examples[0])
    gm_train_time.plot_line_n(xlabel, "Training " + ylabel, "Number of Matchers vs Training Time",
                              feature_classes, subtitle=subtitle)
    gm_classify_time.plot_line_n(xlabel, "Classification " + ylabel, "Number of Matchers vs Classification Time",
                                 feature_classes, subtitle=subtitle)
# Script entry point: run the performance experiment when the file is executed directly.
if __name__ == '__main__':
    experiment5()
    # get_column()  # left disabled; presumably for checking the test column on its own