utils.py
# -*- coding: utf-8 -*-
import json
import operator
from itertools import chain, groupby, tee, izip

def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = tee(iterable)
    next(b, None)
    return izip(a, b)
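# pairwise() is the classic itertools recipe: it yields overlapping
# neighbouring pairs of its input. For illustration (values are arbitrary):
#   list(pairwise([1, 2, 3, 4]))  ->  [(1, 2), (2, 3), (3, 4)]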

def accumulate(iterable, func=operator.add):
    "accumulate([1,2,3,4,5]) --> 1 3 6 10 15"
    it = iter(iterable)
    total = next(it)
    yield total
    for element in it:
        total = func(total, element)
        yield total
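# accumulate() mirrors the running-total generator later added to itertools in
# Python 3; any binary function can be supplied. Illustrative values:
#   list(accumulate([1, 2, 3, 4, 5]))                ->  [1, 3, 6, 10, 15]
#   list(accumulate([1, 2, 3, 4, 5], operator.mul))  ->  [1, 2, 6, 24, 120]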

def masslist_to_indexlist(masslist):
    "Convert a single mass sequence to (boundary index sequence, total mass)."
    [indexlist], m = masses_to_indexes([masslist])
    return indexlist, m

def masses_to_indexes(mass_lists):
    """
    Convert segmentations represented as mass sequences to segmentations
    represented as boundary index sequences, plus total mass.
    """
    sums = [ list(accumulate(m)) for m in mass_lists ]
    [index_lists, total_masses] = zip(*[ (l[:-1], l[-1]) for l in sums ])
    assert 1 == len(set(total_masses))  # make sure they all add up the same
    return index_lists, total_masses[0]
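# Illustrative example (masses are arbitrary): two coders segment the same
# 10-unit document into segments of sizes [2, 3, 5] and [4, 6]. The boundary
# indexes are the running totals with the final grand total dropped:
#   masses_to_indexes([[2, 3, 5], [4, 6]])  ->  (([2, 5], [4]), 10)
#   masslist_to_indexlist([2, 3, 5])        ->  ([2, 5], 10)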

def zip_differences(l):
    return [ operator.sub(*pair) for pair in zip(l[1:], l[0:]) ]
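# zip_differences() returns consecutive differences, i.e. the inverse of a
# running total. For illustration:
#   zip_differences([0, 2, 5, 10])  ->  [2, 3, 5]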

def indexlist_to_masslist(indexlist, m):
    "Convert a single boundary index sequence plus total mass `m` to a mass sequence."
    [masslist] = indexes_to_masses([indexlist], m)
    return masslist

def indexes_to_masses(index_lists, m):
    """
    Given a total mass `m`, convert segmentations represented as
    boundary index sequences to segmentations represented as mass
    sequences.
    """
    return [ zip_differences([0] + l + [m]) for l in index_lists ]
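# Round-trip illustration (same arbitrary example as above): padding each
# index list with 0 and the total mass, then taking differences, recovers the
# original segment masses:
#   indexes_to_masses([[2, 5], [4]], 10)  ->  [[2, 3, 5], [4, 6]]
#   indexlist_to_masslist([2, 5], 10)     ->  [2, 3, 5]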

def assert_same_coders(documents):
    "Assert that every document was segmented by the same set of coders."
    coders = [ set(segmentations.keys()) for segmentations in documents.values() ]
    assert set.intersection(*coders) == set.union(*coders)

def per_document_coefficients(documents, f):
    """
    Expects a dict with document IDs as keys and ``{coder:segmentation}``
    dicts as values. Note that each document must have the same set of
    coders, or an exception will be thrown.
    """
    assert_same_coders(documents)
    # make sure the segmentations are always in the same order
    return { doc_id: f([ segs[a] for a in sorted(segs) ])
             for doc_id, segs in documents.items() }
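# Sketch of how this is meant to be called (the coefficient function below is
# a hypothetical stand-in, not part of this module): `f` receives each
# document's segmentations as a list ordered by coder name and returns a
# statistic for that document.
#   docs = {'set:doc1': {'a': [2, 3, 5], 'b': [4, 6]},
#           'set:doc2': {'a': [1, 9],    'b': [5, 5]}}
#   per_document_coefficients(docs, lambda segs: len(segs))
#   ->  {'set:doc1': 2, 'set:doc2': 2}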

def flattened(dicts):
    "Flatten a sequence of dictionaries into a single sequence of tuples."
    return list(chain(*[d.items() for d in dicts]))
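# flattened() is used below to pool (coder, segmentation) pairs across
# documents. For illustration:
#   flattened([{'a': 1}, {'b': 2}])  ->  [('a', 1), ('b', 2)]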

def overall_segmentations(documents):
    """
    Expects a dict with document IDs as keys and ``{coder:segmentation}``
    dicts as values. Note that each document must have the same set of
    coders, or an exception will be thrown.
    """
    assert_same_coders(documents)
    # make sure segmentations are in document order
    [doc_ids, segmentations] = zip(*sorted(documents.items()))
    key = lambda x: x[0]
    return [ list(chain(*[ x[1] for x in g ]))
             for c, g in groupby(sorted(flattened(segmentations), key=key), key) ]
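# Illustrative example (segmentations here are arbitrary mass lists): each
# coder's per-document segmentations are concatenated in document-ID order,
# yielding one pooled segmentation per coder, ordered by coder name:
#   docs = {'x:1': {'a': [2, 3], 'b': [5]},
#           'x:2': {'a': [1, 4], 'b': [5]}}
#   overall_segmentations(docs)  ->  [[2, 3, 1, 4], [5, 5]]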

def overall_coefficient(documents, f):
    return f(overall_segmentations(documents))

def get_results(documents, f):
    return (per_document_coefficients(documents, f),
            overall_coefficient(documents, f))

def load_segmentation_data(filename):
    with open(filename) as data:
        return json.load(data)
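# The JSON file is returned as-is; the rest of this module suggests it is
# expected to be an object keyed by document ID whose values are
# {coder: segmentation} objects, e.g. (hypothetical contents):
#   {"set:doc1": {"a": [2, 3, 5], "b": [4, 6]}}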

def error(variance, interval=0.95):
    "Half-width of the chosen confidence interval for a given variance."
    if interval == 0.95:
        return 1.96 * (variance ** .5)
    if interval == 0.5:
        return 0.67 * (variance ** .5)
    raise ValueError('interval must be 0.95 or 0.5')
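# 1.96 and 0.67 are the two-sided normal z-scores for 95% and 50% confidence
# intervals (0.674 to three decimals), so error() converts a variance into the
# half-width of the corresponding interval. For illustration:
#   error(4.0)       ->  3.92
#   error(4.0, 0.5)  ->  1.34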

def format_coefficient(c, v):
    return '{:.2f}±{:.2f}'.format(c, error(v))

def print_coefficient(label, c, v):
    print '{}: {:.2f}±{:.2f}'.format(label, c, error(v))

def print_coefficients(d):
    for doc, (c, v) in sorted(d.items(), key=lambda x: x[1], reverse=True):
        print_coefficient(doc.split(':')[1], c, v)

def filter_coders(documents, keep):
    "Keep only the segmentations produced by coders in `keep`."
    [doc_ids, segmentations] = zip(*sorted(documents.items()))
    filtered = [ { k: v for k, v in s.items() if k in keep }
                 for s in segmentations ]
    return dict(zip(doc_ids, filtered))
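# For illustration (document IDs and coder names are arbitrary):
#   docs = {'x:1': {'a': [2, 3], 'b': [5], 'c': [1, 4]}}
#   filter_coders(docs, {'a', 'b'})  ->  {'x:1': {'a': [2, 3], 'b': [5]}}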