# keywords_Extraction.py
# Importing the libraries
import re
from collections import Counter, defaultdict, OrderedDict
import pandas as pd
from sklearn import preprocessing
import PyPDF2
###############################################################################
# Step 1: Extracting text from the PDF file
# creating a pdf file object
pdfFileObj = open('JavaBasics-notes.pdf', 'rb')
# creating a pdf reader object
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
# number of pages in the pdf file
numPages = pdfReader.numPages
# collecting the text of each page
text = []
for i in range(numPages):
    # creating a page object
    pageObj = pdfReader.getPage(i)
    # extracting text from the page
    text.append(pageObj.extractText())
# closing the pdf file object
pdfFileObj.close()
###############################################################################
# Step 2: Data preprocessing
# Expanding common English contractions
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    # expand the standalone abbreviation 'v' to 'very'
    # (word-bounded so a 'v' inside words such as 'virtual' is untouched)
    phrase = re.sub(r"\bv\b", "very", phrase)
    return phrase
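# e.g. decontracted("you can't, we'll") returns "you can not, we will"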
file_text = []
for i in range(len(text)):
    file_text.append(decontracted(text[i]))
# lowercasing, splitting into sentences, and normalizing whitespace
comment_dict = defaultdict(list)
for i in range(len(file_text)):
    sentence = file_text[i]
    sentence = sentence.lower()
    sentence = sentence.split('.')
    for k in range(len(sentence)):
        review = sentence[k].split()
        sentence[k] = ' '.join(review)
        comment_dict[i].append(sentence[k])
# deleting empty '' entries left over from the sentence split
for j in range(len(comment_dict)):
    comment_dict[j] = [comment_dict[j][i] for i in range(len(comment_dict[j])) if comment_dict[j][i] != '']
for i in range(len(comment_dict)):
    file_text[i] = '. '.join(comment_dict[i][j] for j in range(len(comment_dict[i])))
# collecting all preprocessed text of the pdf file
text = ''
for i in range(len(file_text)):
    text = text + file_text[i]
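# e.g. a raw page fragment "Java   is\nportable.It is robust." comes out of
# this step as "java is portable. it is robust"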
###############################################################################
# Step 3: Keywords extraction
# List of possible keywords (built from a set literal, so duplicates collapse)
keywords = list({'java', 'object', 'abstraction', 'encapsulation', 'inheritance', 'polymorphism',
                 'class', 'association', 'aggregation', 'composition', 'private', 'protected',
                 'public', 'abstract', 'extends', 'final', 'implements', 'interface', 'native',
                 'new', 'static', 'strictfp', 'synchronized', 'transient', 'volatile', 'break',
                 'case', 'continue', 'default', 'do', 'else', 'for', 'if', 'instanceof', 'return',
                 'switch', 'while', 'import', 'package', 'boolean', 'byte', 'char', 'double',
                 'float', 'int', 'long', 'short', 'assert', 'catch', 'finally', 'throw', 'throws',
                 'try', 'enum', 'super', 'this', 'void', 'const', 'goto', 'operator', 'expression',
                 'array', 'string', 'primitive', 'declaration', 'applet', 'html', 'css',
                 'javascript', 'robustness', 'security', 'portability', 'garbage collections',
                 'compile', 'method', 'instance', 'initialize'})
# Finding the maximum length over all keywords
maxx = 0
for i in range(len(keywords)):
    if len(str(keywords[i])) > maxx:
        maxx = len(str(keywords[i]))
# Collecting every substring of the pdf text that matches a keyword
count = []
for i in range(len(text)):
    for j in range(maxx + 1):
        if text[i:i+j] in keywords:
            count.append(text[i:i+j])
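# The nested loop slides a window of every length up to the longest keyword
# across the text, so text[i:i+j] is compared against the keyword list by raw
# substring matching; this also counts keywords embedded inside longer words
# (e.g. 'for' inside 'before'), which can inflate counts for short keywords.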
# Number of occurrences of each keyword in the text
c = Counter(count)
keyword_dict = dict()
for i in keywords:
    keyword_dict[i] = c[i]
# Sorting keywords in descending order of their number of occurrences
key_words = OrderedDict(sorted(keyword_dict.items(), key=lambda x: x[1], reverse=True))
# Printing all keywords
print('Keywords :', key_words)
keys = list(key_words.keys())
values = list(key_words.values())
# Removing all keywords that are not present in the pdf text
# (values is sorted in descending order, so the zero counts form the tail)
index = 0
for i in range(len(values)):
    if values[i] > 0:
        index += 1
keys = keys[:index]
values = values[:index]
# Creating a dataframe to calculate the normalized score
dataframe = pd.DataFrame()
dataframe['No of occurrences'] = values
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(dataframe)
df_normalized = pd.DataFrame(np_scaled)
temp = list(df_normalized[0])
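# MinMaxScaler rescales each count x to (x - min) / (max - min), so the most
# frequent keyword gets weightage 1.0 and the least frequent kept keyword 0.0.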
# Result stored in a dataframe named 'df'
df = pd.DataFrame()
df['Keywords'] = keys
df['Normalized Weightage'] = temp
df['No of occurrences'] = values
# Writing the result to a csv file
df.to_csv('Result.csv')
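# Usage: run the script with 'JavaBasics-notes.pdf' in the working directory;
# the ranked keywords are printed to stdout and saved to 'Result.csv'.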