-
Notifications
You must be signed in to change notification settings - Fork 0
/
summarizers.py
148 lines (132 loc) · 4.87 KB
/
summarizers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import re
import pdb
import time
import numpy as np
#import rnn_language
from functions import simple_red, uni_cos_red, rnn_group_red, cnn_red
from heapq import heappush, heapify, heappop, nsmallest
__author__ = 'matteo'
# summarize a collection
def multi_lead(collection, num_words, max_sent):
    """Lead baseline summarizer.

    Ranks every sentence by its first (lead/position) feature and greedily
    extracts the highest-scoring sentences until the word or sentence budget
    is reached.

    collection -- corpus object exposing docs{..}.sent{..}; each sentence
                  entry is (text, feature_vector)  # assumed from usage -- confirm
    num_words  -- word budget for the summary
    max_sent   -- maximum number of sentences to extract
    Returns a list of cleaned (whitespace/hyphen-normalized) sentence strings.
    """
    start = time.time()
    h = []
    for d in collection.docs.values():
        for s in d.sent.values():
            # dot with a one-hot vector on position 0 == take the first feature
            lead_w = np.zeros(len(list(s[1])))
            lead_w[0] = 1
            rel = np.dot(np.asarray(list(s[1])), lead_w)
            if len(s[0]) < 350:  # modify simultaneously as line 270 of data_structures.py
                heappush(h, (-1 * rel, s[0]))
    cw = cs = 0
    most_rel = []
    while cw < num_words and cs < max_sent:
        try:
            cand = heappop(h)
        except IndexError:
            # candidates exhausted before the budget was met; stop cleanly
            break
        add = len(cand[1].split())
        if (cw + add) < num_words:
            most_rel.append(re.sub('-', ' ', re.sub(r'\s+', ' ', cand[1])).strip())
            cw += add
            cs += 1
    print("lead: %f seconds" % (time.time() - start))
    return most_rel
# summarize a collection according to relevance score
def rel_summarize(collection, clf, num_words, max_sent, idx):
    """Supervised extractive summarizer.

    Scores every sentence with the trained model and greedily extracts the
    highest-scoring sentences until the word or sentence budget is reached.

    collection -- corpus object exposing docs{..}.sent{..}; each sentence
                  entry is (text, features) with features indexable by idx
    clf        -- model exposing predict()
    num_words  -- word budget for the summary
    max_sent   -- maximum number of sentences to extract
    idx        -- which feature representation of the sentence to feed clf
    Returns a list of cleaned sentence strings.
    """
    start = time.time()
    h = []
    for d in collection.docs.values():
        for s in d.sent.values():
            rel = clf.predict(s[1][idx])
            if len(s[0]) < 350:  # modify simultaneously as line 270 of data_structures.py
                heappush(h, (-1 * rel, s[0]))
    cw = cs = 0
    most_rel = []
    while cw < num_words and cs < max_sent:
        try:
            cand = heappop(h)
        except IndexError:
            # was a bare except + pdb.set_trace() debugger hook; an exhausted
            # heap just means we can stop extracting (matches mmr_summarize)
            break
        add = len(cand[1].split())
        if (cw + add) < num_words:
            most_rel.append(re.sub('-', ' ', re.sub(r'\s+', ' ', cand[1])).strip())
            cw += add
            cs += 1
    print("supervised, greedy extraction: %f seconds" % (time.time() - start))
    return most_rel
# maximum marginal relevance summarization
def mmr_summarize(collection, clf, ext_algo, red_algo, num_words, max_sent, tradeoff, idx, w2v=None):
    """Maximum-marginal-relevance summarizer.

    Greedily extracts sentences, re-scoring the remaining candidates after
    each pick so redundancy with the growing summary is penalized.

    collection -- corpus object exposing docs{..}.sent{..}; entries (text, features)
    clf        -- model exposing predict()
    ext_algo   -- extraction strategy: "greedy" (implemented) or "dyn-prog" (stub)
    red_algo   -- redundancy measure name, forwarded to evaluate_redundancy
    num_words  -- word budget for the summary
    max_sent   -- maximum number of sentences to extract
    tradeoff   -- relevance/redundancy mixing weight in [0, 1]
    idx        -- which feature representation of the sentence to feed clf
    w2v        -- optional word-embedding model for the "cnn" redundancy measure
    Returns a list of cleaned sentence strings.
    Raises Exception for an unknown ext_algo.
    """
    if ext_algo == "greedy":
        start = time.time()
        h = []
        rel_by_sent = {}  # was named `dict`, shadowing the builtin
        for d in collection.docs.values():
            for s in d.sent.values():
                rel = clf.predict(s[1][idx])
                if len(s[0]) < 350:  # modify simultaneously as line 270 of data_structures.py
                    heappush(h, (-1 * tradeoff * rel, s[0]))
                    rel_by_sent[s[0]] = rel
        h = nsmallest(max_sent, h)  # keep only the max_sent best candidates
        if not h:
            return []  # nothing passed the length filter: empty summary, not a crash
        cs = 1
        cw = 0
        flag = False
        most_rel = []
        first = re.sub('-', ' ', re.sub(r'\s+', ' ', heappop(h)[1])).strip()
        most_rel.append(first)
        cw += len(first.split())
        while cw < num_words - 10 and cs < max_sent:
            if flag:
                # re-score remaining candidates against the newest summary sentence
                evaluate_redundancy(h, rel_by_sent, most_rel, tradeoff, red_algo, w2v)
                heapify(h)
                flag = False
            try:
                cand = heappop(h)
            except IndexError:
                break
            add = len(cand[1].split())
            if (cw + add) < num_words:
                most_rel.append(re.sub('-', ' ', re.sub(r'\s+', ' ', cand[1])).strip())
                cw += add
                flag = True
                cs += 1
        print("supervised, greedy extraction: %f seconds" % (time.time() - start))
        return most_rel
    elif ext_algo == 'dyn-prog':
        pass
    else:
        raise Exception('Extract Summary: Invalid algorithm')
# update overall score of each sentences according to the given redundancy measure
def _incremental_red_update(c, d, summ, tradeoff, pair_red):
    """Fold the redundancy of each candidate against the NEWEST summary
    sentence into the running average stored in its heap entry (in place).

    c        -- list of (score, sentence) heap entries, mutated in place
    d        -- sentence -> raw relevance score
    summ     -- summary sentences chosen so far (last element is the new one)
    tradeoff -- relevance/redundancy mixing weight
    pair_red -- callable(sentence, newest_summary_sentence) -> redundancy
    """
    if not c:
        return
    t = summ[-1]
    k = float(len(summ))
    for i in range(len(c)):
        s = c[i][1]
        rel = -1 * tradeoff * d[s]
        red = (1 - tradeoff) * pair_red(s, t)
        prev_score = c[i][0]
        # recover the accumulated redundancy sum from the previous average
        old_red_sum = (prev_score - rel) * (k - 1)
        c[i] = (rel + (old_red_sum + red) / k, s)

def evaluate_redundancy(c, d, summ, tradeoff, red_algo, w2v=None):
    """Re-score every candidate heap entry with the chosen redundancy measure.

    "simpleRed" and "cnn" keep a running average vs. each new summary
    sentence; "uniCosRed" and "groupRnnEmbedding" recompute against the
    whole summary.  Mutates c in place; raises Exception for an unknown
    red_algo.
    """
    if red_algo == "simpleRed":
        _incremental_red_update(c, d, summ, tradeoff, lambda s, t: simple_red(s, t))
    elif red_algo == "uniCosRed":
        for i in range(len(c)):
            s = c[i][1]
            c[i] = (-1 * tradeoff * d[s] + (1 - tradeoff) * uni_cos_red(s, summ), s)
    elif red_algo == "groupRnnEmbedding":
        for i in range(len(c)):
            s = c[i][1]
            c[i] = (-1 * tradeoff * d[s] + (1 - tradeoff) * rnn_group_red(s, summ), s)
    elif red_algo == "cnn":
        _incremental_red_update(c, d, summ, tradeoff, lambda s, t: cnn_red(s, t, w2v))
    else:
        raise Exception('Redundancy Measure: Invalid algorithm')
# choose best order for a set of extracted sentences
def reorder(sent_list, algorithm):
    """Choose the best presentation order for the extracted sentences.

    Not implemented: currently a no-op returning None, so callers receive
    sentences in extraction (relevance) order.
    """
    pass
# process cross sentence references
def preprocess_crossreferences(corpus):
    """Resolve cross-sentence references (e.g. pronouns) in the corpus.

    Not implemented: currently a no-op returning None.
    """
    pass