TextSummarize.py
from nltk import tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest
import logging


class TextSummarize:
    def __init__(self, min_th=0.1, max_th=0.9):
        """
        Initialization of the summarizer
        :param min_th: words with a normalized frequency lower than min_th will be ignored
        :param max_th: words with a normalized frequency higher than max_th will be ignored
        """
        self._min_th = min_th
        self._max_th = max_th
        self._stopwords = set(stopwords.words('english') + list(punctuation))
    def _compute_frequencies(self, sentences):
        """
        Compute the normalized frequency of each word in the given sentences
        :param sentences: list of tokenized sentences
        :return: a dict mapping each word to its normalized frequency
        """
        # we could use the nltk frequency utilities (maybe later), however
        # it is easy to create our own using a default dictionary
        freq = defaultdict(int)
        for sentence in sentences:
            for token in sentence:
                if token not in self._stopwords:
                    freq[token] += 1
        # guard against input that contains only stopwords/punctuation,
        # otherwise max() below would raise on an empty dict
        if not freq:
            return freq
        # now we need to normalize the frequency dict
        # by dividing all the values by the highest value
        max_freq = float(max(freq.values()))
        for token in freq.keys():
            freq[token] = freq[token] / max_freq
        # filter out words whose normalized frequency falls outside
        # the [min_th, max_th] thresholds
        to_be_deleted = [token for token, value in freq.items()
                         if value > self._max_th or value < self._min_th]
        for token in to_be_deleted:
            del freq[token]
        return freq
    def summarize(self, text, n):
        """
        Summarize the text using the frequencies of the words
        :param text: the text to summarize
        :param n: number of sentences in the summary
        :return: a list with the n highest-ranked sentences
        """
        sentences = tokenize.sent_tokenize(text)  # nltk list of sentences
        # check if the requested summary is longer than the input
        if n > len(sentences):
            n = int(len(sentences) / 2)
            logging.warning('Watch out! Argument n is larger than the number of sentences, '
                            'so n is now set to half the number of sentences!')
        token_sent = [tokenize.word_tokenize(s.lower()) for s in sentences]  # list of lists of words/tokens
        self._freq = self._compute_frequencies(token_sent)
        # score each sentence by the sum of the frequencies of its words
        ranking = defaultdict(int)
        for i, sent in enumerate(token_sent):
            for word in sent:
                if word in self._freq:
                    ranking[i] += self._freq[word]
        sents_idx = self._ranking(ranking, n)
        # return the selected sentences in their original order
        return [sentences[j] for j in sorted(sents_idx)]
    def _ranking(self, ranking, n):
        """
        Return the indices of the n highest-ranked sentences
        :param ranking: dict mapping sentence index to score
        :param n: number of indices to return
        :return: the n sentence indices with the highest scores
        """
        return nlargest(n, ranking, key=ranking.get)
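

# A minimal usage sketch (an assumption, not part of the original file).
# It presumes the NLTK 'punkt' and 'stopwords' corpora have already been
# downloaded, e.g. via nltk.download('punkt') and nltk.download('stopwords'),
# and 'sample.txt' is a hypothetical plain-text file to summarize.
if __name__ == '__main__':
    summarizer = TextSummarize(min_th=0.1, max_th=0.9)
    with open('sample.txt') as f:
        text = f.read()
    # ask for a 3-sentence summary
    for sentence in summarizer.summarize(text, 3):
        print(sentence)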