-
Notifications
You must be signed in to change notification settings - Fork 8
/
top_sent.py
32 lines (27 loc) · 1.12 KB
/
top_sent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import re
import spacy
# Load the 'en_core_web_sm' spaCy pipeline once at import time; top_sentences()
# below reuses this single instance for every page it parses.
nlp = spacy.load('en_core_web_sm')
def top_sentences(page_text):
    """Build a short extractive summary for each page of text.

    For every page, bracketed numeric markers such as ``[12]`` are removed
    and newlines are replaced by spaces. The cleaned text is parsed with the
    module-level spaCy pipeline ``nlp``; each sentence is scored by its
    similarity to the whole page, and the five highest-scoring sentences are
    joined (highest score first, separated by a single space).

    Args:
        page_text: Iterable of page texts. Each item is passed to a regex
            substitution, so it is expected to be a ``str``.

    Returns:
        A list containing one single-element list ``[summary]`` per page
        that was processed successfully. A page that raises during
        processing is reported on stdout and skipped, so the result may be
        shorter than the input.
    """
    # Compile once instead of re-resolving the pattern for every page.
    citation_marker = re.compile(r'\[\d+]+')
    summarized_pages = []

    for page in page_text:
        try:
            cleaned = citation_marker.sub('', page).replace("\n", " ")
            doc = nlp(cleaned)
            # (sentence text, similarity of that sentence to the whole page)
            scored = [(sent.text.strip(), sent.similarity(doc))
                      for sent in doc.sents]
            best = sorted(scored, key=lambda pair: pair[1], reverse=True)[:5]
            # Sentences were stripped above, so join with a space to keep
            # them from running into each other.
            summary = " ".join(sentence for sentence, _ in best)
        except Exception as exc:
            # Best-effort: one bad page must not abort the whole batch, but
            # do not swallow KeyboardInterrupt/SystemExit and do say what
            # went wrong.
            print(f"error top_sent: {exc}")
            continue
        summarized_pages.append([summary])

    return summarized_pages