-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path11_create_final_dataset.py
64 lines (57 loc) · 3.02 KB
/
11_create_final_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import csv
import numpy as np
from utilwebisadb import set_csv_field_size
def generate_final_dataset_all(full_file_path, prediction_path, out_path):
    """Copy every classified relation to ``out_path`` with label and score.

    The prediction CSV is read first. In each of its rows column 73 holds
    the relation id, column 75 the score kept here as "uncertainty" and
    column 76 the predicted label. Rows whose label is neither 'yes' nor
    'no' (for example a header line) and blank rows are ignored.

    Every row of the full CSV whose first column matches a predicted id is
    then written to the output followed by the label and the score. The
    number of 'yes' predictions and the smallest 'yes' / largest 'no' score
    are printed; a missing class is reported as ``None``.

    Args:
        full_file_path: CSV whose first column is an integer relation id.
        prediction_path: CSV with at least 77 columns, laid out as above.
        out_path: Destination CSV; it is overwritten if it already exists.

    Raises:
        IndexError: If a non-blank prediction row has fewer than 77 columns.
        ValueError: If an id or a score cannot be parsed as a number.
    """
    id_column, score_column, label_column = 73, 75, 76
    correct_relation_to_uncertainty = {}
    incorrect_relation_to_uncertainty = {}
    label_to_scores = {
        'yes': correct_relation_to_uncertainty,
        'no': incorrect_relation_to_uncertainty,
    }

    # newline='' lets the csv module do its own newline handling, so line
    # breaks embedded in quoted fields survive unchanged.
    with open(prediction_path, newline='') as prediction_file:
        for row in csv.reader(prediction_file):
            if not row:
                # csv.reader yields [] for a blank line; nothing to index.
                continue
            scores = label_to_scores.get(row[label_column])
            if scores is not None:
                # Ids may be serialised as floats ("12.0"), hence int(float()).
                scores[int(float(row[id_column]))] = float(row[score_column])
    print(len(correct_relation_to_uncertainty))

    with open(full_file_path, newline='') as full_file, \
            open(out_path, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        for row in csv.reader(full_file):
            if not row:
                continue
            relation_id = int(row[0])
            # 'yes' is checked before 'no' so the output order is stable
            # when an id occurs under both labels.
            for label in ('yes', 'no'):
                uncertainty = label_to_scores[label].get(relation_id)
                if uncertainty is not None:
                    # uncertainty is always for the positive class
                    writer.writerow(row + [label, uncertainty])

    # default=None avoids a ValueError when one class has no predictions.
    print('min positive theshold: {}'.format(
        min(correct_relation_to_uncertainty.values(), default=None)))
    print('max negative theshold: {}'.format(
        max(incorrect_relation_to_uncertainty.values(), default=None)))
def generate_final_dataset(full_file_path, prediction_path, out_path):
    """Copy the relations predicted 'yes' to ``out_path`` with their score.

    In the prediction CSV column 73 holds the relation id, column 75 the
    score kept here as "uncertainty" and column 76 the predicted label.
    Only rows labelled 'yes' are used; blank rows are skipped. Every row of
    the full CSV whose first column matches one of those ids is written to
    the output followed by ``'yes'`` and the score. The number of 'yes'
    predictions is printed.

    Args:
        full_file_path: CSV whose first column is an integer relation id.
        prediction_path: CSV with at least 77 columns, laid out as above.
        out_path: Destination CSV; it is overwritten if it already exists.

    Raises:
        IndexError: If a non-blank prediction row has fewer than 77 columns.
        ValueError: If an id or a score cannot be parsed as a number.
    """
    id_column, score_column, label_column = 73, 75, 76
    correct_relation_to_uncertainty = {}

    # newline='' lets the csv module do its own newline handling, so line
    # breaks embedded in quoted fields survive unchanged.
    with open(prediction_path, newline='') as prediction_file:
        for row in csv.reader(prediction_file):
            if not row:
                # csv.reader yields [] for a blank line; nothing to index.
                continue
            if row[label_column] == 'yes':
                # Ids may be serialised as floats ("12.0"), hence int(float()).
                relation_id = int(float(row[id_column]))
                correct_relation_to_uncertainty[relation_id] = float(row[score_column])
    print(len(correct_relation_to_uncertainty))

    with open(full_file_path, newline='') as full_file, \
            open(out_path, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        for row in csv.reader(full_file):
            if not row:
                continue
            uncertainty = correct_relation_to_uncertainty.get(int(row[0]))
            if uncertainty is not None:
                writer.writerow(row + ['yes', uncertainty])
def generate_bins(prediction_path, num_bins=20):
    """Print and return a histogram of the scores in column 75.

    Each row of the prediction CSV contributes the float found in column 75.
    Rows where that column is missing or not numeric (header lines, blank
    lines) are skipped. The scores are counted into ``num_bins``
    equal-width bins spanning 0 to 1; values outside that span fall into no
    bin.

    Args:
        prediction_path: CSV file holding the scores in column 75.
        num_bins: Number of equal-width bins between 0 and 1. The default
            of 20 gives bins of width 0.05.

    Returns:
        The ``(counts, bin_edges)`` tuple produced by ``numpy.histogram``.
    """
    score_column = 75
    yes_scores = []
    # newline='' lets the csv module do its own newline handling.
    with open(prediction_path, newline='') as prediction_file:
        for row in csv.reader(prediction_file):
            try:
                yes_scores.append(float(row[score_column]))
            except (IndexError, ValueError):
                # Best effort: rows without a parsable score are ignored
                # rather than aborting the whole run.
                continue

    bin_edges = np.linspace(0, 1, num=num_bins + 1)
    histogram = np.histogram(np.array(yes_scores), bins=bin_edges)
    print(histogram)
    return histogram
if __name__ == "__main__":
    # Alternative entry points, currently disabled. Uncomment the one to run.
    #set_csv_field_size()
    #generate_final_dataset_all('webisa_1_with_sent.csv', 'webisa_1_with_sent_analysis/prediction.csv', 'webisa_1_final.csv')
    #generate_final_dataset('webisa_1_with_sent.csv', 'webisa_1_with_sent_analysis/prediction.csv', 'webisa_1_final.csv')

    # Active step: print the score histogram for the prediction file.
    prediction_csv = 'webisa_1_with_sent_analysis/prediction.csv'
    generate_bins(prediction_csv)