-
Notifications
You must be signed in to change notification settings - Fork 27
/
log_training_to_arff.py
95 lines (81 loc) · 2.95 KB
/
log_training_to_arff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# Copyright (C) 2012 Brian Wesley Baugh
"""Generates a Weka ARFF file from the labelled question-answer pairs."""
# Constants

# Tab-separated training log that is converted to ARFF.
CORPUS_FNAME = "log_training.txt"

# Name of the nominal class attribute written to the ARFF header.
CLASS_ATTRIBUTE = "__class__"

# ARFF attribute declarations ("<name> <type>"), in the same order as the
# feature values stored for every answer in the training log.
FEATURES = (
    "page_cosine_sim NUMERIC",
    "term_count NUMERIC",
    "related_sum NUMERIC",
    "related_average NUMERIC",
    "causal_match {True,False}",
    "position NUMERIC",
    "text_length NUMERIC",
)

# Number of tab-separated fields one answer occupies in the log:
# its rank followed by one value per feature.
SPAN = 1 + len(FEATURES)
def write_header(fileobj, class_list):
    """Write the ARFF header section to ``fileobj``.

    The header consists of a two-line ``%`` comment naming the source
    corpus, the ``@RELATION`` line, one ``@ATTRIBUTE`` line per entry in
    ``FEATURES`` and finally the nominal class attribute.

    Args:
        fileobj: Writable text file-like object; only ``write`` is used.
        class_list: Iterable of class-label strings.  A sequence keeps the
            order given by the caller; a ``set``/``frozenset`` is sorted so
            that the generated header is identical from run to run.
    """
    fileobj.write(
        '% This file generated automatically from the following filename:\n'
    )
    fileobj.write('% {}\n'.format(CORPUS_FNAME))
    fileobj.write('@RELATION "{}"\n'.format(CORPUS_FNAME))
    fileobj.write('\n')

    for attribute in FEATURES:
        fileobj.write('@ATTRIBUTE {}\n'.format(attribute))
    fileobj.write('\n')

    # Sets have no defined iteration order (and string hashing may be
    # randomized per process), so sort them to get a reproducible header.
    if isinstance(class_list, (set, frozenset)):
        class_list = sorted(class_list)
    # Doubled braces are literal braces around the nominal value list.
    fileobj.write('@ATTRIBUTE {} {{{}}}\n'.format(CLASS_ATTRIBUTE,
                                                  ','.join(class_list)))
    fileobj.write('\n')
def write_data(fileobj):
    """Write the ``@DATA`` section of the ARFF file to ``fileobj``.

    Every line of ``CORPUS_FNAME`` is tab-separated: an IR query, the
    query, a comma-separated list of answer positions, and then repeating
    groups of ``SPAN`` fields (an answer's rank followed by one value per
    entry in ``FEATURES``).  Answers whose rank appears in the answer
    positions are labelled ``'1'``; all others are labelled ``'-1'``.

    Args:
        fileobj: Writable text file-like object; only ``write`` is used.

    Raises:
        ValueError: If an answer does not carry exactly ``len(FEATURES)``
            feature values.  The offending line is part of the message.
    """
    fileobj.write('@DATA\n')
    with open(CORPUS_FNAME) as corpus:
        for line in corpus:
            _ir_query, _query, answer_positions, answers = line.split('\t', 3)
            answer_positions = answer_positions.split(',')
            # Queries listing position '0' are skipped entirely (presumably
            # '0' marks "no correct answer was ranked" -- TODO confirm).
            if '0' in answer_positions:
                continue

            fields = answers.split('\t')
            correct = []
            wrong = []
            for start in range(0, len(fields), SPAN):
                group = fields[start:start + SPAN]
                rank, features = group[0], group[1:]
                if len(features) != len(FEATURES):
                    raise ValueError('ERROR ON LINE:\n' + line)
                # The last value of the last answer carries the newline.
                features[-1] = features[-1].strip()
                if rank in answer_positions:
                    features.append('1')
                    correct.append(features)
                else:
                    features.append('-1')
                    wrong.append(features)

            # Oversample the minority class (correct answers) so that the
            # number of correct and incorrect instances is roughly equal.
            # Floor division keeps the factor an int; it is clamped to at
            # least 1 so correct answers are never dropped when they
            # outnumber the wrong ones, and it is skipped when there are no
            # correct answers (avoids a division by zero).
            if correct:
                repeats = max(1, len(wrong) // len(correct))
            else:
                repeats = 0
            for instance in correct * repeats + wrong:
                fileobj.write(','.join(instance) + '\n')
def build_class_list():
    """Return the set of class labels used for the class attribute.

    ``'1'`` is the label ``write_data`` appends to correct answers and
    ``'-1'`` the label it appends to all other answers.
    """
    labels = {'1', '-1'}
    return labels
def main():
    """Convert ``CORPUS_FNAME`` into ``CORPUS_FNAME + '.arff'``.

    Opens the output file for writing, emits the ARFF header followed by
    the data section, and prints progress messages to standard output.
    """
    # Single-argument print(...) behaves the same on Python 2.7 and 3.x.
    with open(CORPUS_FNAME + '.arff', mode='w') as arff:
        print("Building class_list")
        class_list = build_class_list()
        print("Writing ARFF file")
        write_header(arff, class_list)
        write_data(arff)
    print("Done!")
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()