-
Notifications
You must be signed in to change notification settings - Fork 0
/
binary_statistics.py
140 lines (122 loc) · 4.73 KB
/
binary_statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
from scipy import stats
import json
from pprint import pprint
import numpy as np
import pandas as pd
import copy
import os
from collections import OrderedDict
from echr_experiments.config import ROUND_DIGITS
from echr_experiments.format import sort_article
from echr_experiments.utils import save
# JSON file holding the experiment results; opened by main().
RESULT_PATH = 'data/output/result_binary.json'

# Full dataset name -> short label, kept in a fixed insertion order.
dataset_short = OrderedDict()
dataset_short["Descriptive features only"] = "desc"
dataset_short["Bag-of-Words only"] = "BoW"
dataset_short["Descriptive features and Bag-of-Words"] = "both"
def data_to_article(_data):
    """Regroup flat experiment results by article, method and dataset.

    Args:
        _data: Mapping whose keys have the form ``"<article> - <dataset>"``.
            Each value must provide ``['filter']['prevalence']`` and may
            provide a ``"methods"`` mapping of method name to that method's
            result.

    Returns:
        A ``(data, prev)`` tuple. ``data[article][method][dataset]`` is the
        result stored for that combination; an article whose entries have no
        ``"methods"`` still gets an empty mapping. ``prev[article]`` is the
        prevalence of the first entry encountered for that article.

    Raises:
        IndexError: If a key does not contain the ``' - '`` separator.
        KeyError: If the first entry of an article lacks
            ``['filter']['prevalence']``.
    """
    data = {}
    prev = {}
    for entry, content in _data.items():
        parts = entry.split(' - ')
        if len(parts) < 2:
            # Same exception type the bare index lookup used to raise, but
            # with a message that points at the offending key.
            raise IndexError(
                "result key {!r} is not of the form "
                "'<article> - <dataset>'".format(entry)
            )
        article, dataset = parts[0], parts[1]

        methods = data.setdefault(article, {})
        # Only the first entry seen for an article defines its prevalence.
        if article not in prev:
            prev[article] = content['filter']['prevalence']

        if "methods" in content:
            for method, result in content["methods"].items():
                methods.setdefault(method, {})[dataset] = result
    return data, prev
def generate_samples_per_article(name, _data, key="acc", order=max):
    """Print one article's scores for the BoW-only and BoW+descriptive datasets.

    Methods are visited in sorted order. For each one, the selected test
    metric is rounded to 4 decimals. The "Bag-of-Words only" values are
    printed first, one per line, then a separator, then the values for
    "Descriptive features and Bag-of-Words". A method that lacks one of the
    datasets simply contributes nothing to that list.

    Args:
        name: Unused; kept so existing callers keep working.
        _data: Mapping ``method -> dataset name -> result`` where
            ``result['test']['test_<key>']`` holds the score.
        key: Metric suffix; the value read is ``'test_' + key``.
        order: Unused; kept for backward compatibility.

    Raises:
        KeyError: If a present dataset result lacks the requested metric.
    """
    # Pick the datasets by their short label rather than by position: the
    # previous positional test put each dataset's scores in the other's list.
    full_name = {short: full for full, short in dataset_short.items()}
    bow_only = full_name["BoW"]
    bow_and_desc = full_name["both"]
    metric = 'test_{}'.format(key)

    sample_bow = []
    sample_bow_desc = []
    for method in sorted(_data):
        results = _data[method]
        # np.round replaces np.round_, which was removed in NumPy 2.0.
        if bow_only in results:
            sample_bow.append(np.round(results[bow_only]['test'][metric], 4))
        if bow_and_desc in results:
            sample_bow_desc.append(
                np.round(results[bow_and_desc]['test'][metric], 4)
            )

    # BoW only
    print('\n'.join(map(str, sample_bow)))
    print('\n')
    # BoW + desc
    print('\n'.join(map(str, sample_bow_desc)))
def generate_samples_per_method(name, _data, key="acc", std=True, order=max):
    """Print, per method, paired BoW-only / BoW+descriptive scores and a Wilcoxon test.

    The raw results are regrouped per article with ``data_to_article``. For
    every method, one pair of scores is collected per article: the selected
    test metric, rounded to 4 decimals, on "Bag-of-Words only" and on
    "Descriptive features and Bag-of-Words". An article contributes a pair
    only when the method has results for both datasets, so the two samples
    stay aligned for the paired test.

    For each method the function prints a ``# <method>`` header, the BoW-only
    scores, the BoW+descriptive scores, and the result of
    ``scipy.stats.wilcoxon`` on the two samples.

    Args:
        name: Unused; kept so existing callers keep working.
        _data: Flat results mapping as accepted by ``data_to_article``.
        key: Metric suffix; the value read is ``'test_' + key``.
        std: Unused; kept for backward compatibility.
        order: Unused; kept for backward compatibility.

    Raises:
        KeyError: If a present dataset result lacks the requested metric.
    """
    # Pick the datasets by their short label rather than by position: the
    # previous positional test put each dataset's scores in the other's list.
    full_name = {short: full for full, short in dataset_short.items()}
    bow_only = full_name["BoW"]
    bow_and_desc = full_name["both"]
    metric = 'test_{}'.format(key)

    sample_bow = {}
    sample_bow_desc = {}
    data_per_article, _ = data_to_article(_data)
    for methods in data_per_article.values():
        for method in sorted(methods):
            bow_scores = sample_bow.setdefault(method, [])
            both_scores = sample_bow_desc.setdefault(method, [])
            results = methods[method]
            # The Wilcoxon signed-rank test is a paired test: keep an article
            # only when both measurements exist, otherwise the two lists end
            # up with different lengths or silently misaligned.
            if bow_only in results and bow_and_desc in results:
                # np.round replaces np.round_, which was removed in NumPy 2.0.
                bow_scores.append(
                    np.round(results[bow_only]['test'][metric], 4)
                )
                both_scores.append(
                    np.round(results[bow_and_desc]['test'][metric], 4)
                )

    for method, bow_scores in sample_bow.items():
        both_scores = sample_bow_desc[method]
        print('# {}'.format(method))
        # BoW only
        print('\n'.join(map(str, bow_scores)))
        print('\n')
        # BoW + desc
        print('\n'.join(map(str, both_scores)))
        print('\n')
        if bow_scores:
            print(stats.wilcoxon(bow_scores, both_scores))
        else:
            # Nothing to compare for this method.
            print('No paired samples: Wilcoxon test skipped')
def main():
    """Load the results file and print the per-article and per-method samples.

    Reads ``RESULT_PATH`` as JSON, prints the samples of every article for
    each enabled metric, then prints the per-method samples (including the
    Wilcoxon test) for each enabled metric.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(RESULT_PATH, encoding='utf-8') as f:
        _data = json.load(f)

    # Results per article.
    data_per_article, _ = data_to_article(_data)
    # Only accuracy is enabled. Entries left disabled in the original list:
    # 'mcc', 'precision', 'recall', 'f1_weighted', 'balanced_acc'.
    article_keys = ['acc']
    for article, data in data_per_article.items():
        for key in article_keys:
            print(' # {} - {}'.format(article, key))
            generate_samples_per_article(article, data, key=key, order=max)
            print('\n')

    # Results per method, as (metric key, display name) pairs. The same
    # additional metrics as above were left disabled in the original list.
    method_keys = [
        ('acc', 'Accuracy'),
    ]
    for key, label in method_keys:
        generate_samples_per_method(label, _data, key, order=max)


if __name__ == "__main__":
    main()