preprocessing.py
# What does this file do?
# It reads the documents, tokenizes them,
# stems and lowercases the tokens, sorts them, and
# stores them in the terms.txt file.
import re
from operator import itemgetter
from nltk.stem.porter import PorterStemmer

stopwords = []
# Populate the stopwords list from Stopword-List.txt.
def initialize_stopwords():
    with open("Stopword-List.txt", "r") as f:
        for line in f:
            for word in line.split():
                stopwords.append(word)
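# Note: membership tests against a list are O(n) per lookup; converting the
# stopwords to a set (e.g. stopwords = set(stopwords) after loading) would
# make the check in tokenize() O(1). Kept as a list here to preserve the
# original structure; the set variant is an optional optimization.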
# Read the documents one by one and tokenize them into tokens.
def tokenize():
    count = 0
    tokens = []
    docId = []
    pos = []
    for i in range(448):
        j = i + 1
        with open("Abstracts/" + str(j) + ".txt", "r", encoding="latin1") as f:
            k = 0
            for line in f:
                for word in re.findall(r"[\w']+", line):
                    word = word.replace("'", "")
                    # Keep the word only if it is not in the stopword list.
                    if word not in stopwords:
                        tokens.append(word)
                        pos.append(k)
                        docId.append(j)
                        k = k + 1
                        count = count + 1
    return {"tokens": tokens, "docId": docId, "pos": pos}
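# For example, re.findall(r"[\w']+", "don't stop-words") yields
# ["don't", "stop", "words"]; stripping the apostrophe then turns
# "don't" into "dont" before the stopword check.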
def linguistics(tokens):
    stemmer = PorterStemmer()  # does the stemming as well as the lowercasing
    terms = []  # tokens after stemming and normalization
    index = 0
    for token in tokens["tokens"]:
        terms.append([stemmer.stem(token), tokens["docId"][index], tokens["pos"][index]])
        index = index + 1
    return terms
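# For instance, stemmer.stem("Running") returns "run": NLTK's PorterStemmer
# lowercases its input and then strips the suffix.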
initialize_stopwords()
tokens = tokenize()
terms = linguistics(tokens)
sorted_terms = sorted(terms, key=itemgetter(0))
with open("terms.txt", "w") as sf:
    sf.write(str(sorted_terms))
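# A minimal sketch of loading the output back (assuming terms.txt was written
# by this script); ast.literal_eval parses the repr produced by
# str(sorted_terms):
#
#     import ast
#     with open("terms.txt", "r") as tf:
#         loaded = ast.literal_eval(tf.read())
#     # loaded is a list of [term, docId, position] entries sorted by term.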