import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
import wordcloud
import gensim
from gensim.models.ldamodel import LdaModel
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from pprint import pprint
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
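# Note: the NLTK stopword list must be available locally; if it is not,
# download it once (an environment note, not part of the original pipeline):
# import nltk
# nltk.download('stopwords')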
##### Loading Data set #####
MainData = pd.read_csv('articles1.csv')
pd.set_option('display.max_columns', None)
print("_________________________________________________")
print("Loading data done.")
print("_________________________________________________")
####################################
##### Preprocessing Data set #####
# The index column in the original dataset is renamed and then dropped #
MainData.rename(columns={'Unnamed: 0': 'Name'}, inplace=True)
# Drop all columns that don't affect the topic modeling #
MainData = MainData.drop(columns=['Name', 'id', 'title', 'author', 'publication', 'date', 'year', 'month', 'url'])
### Model_1 Part ###
def remove_punc(text):
    # Function to remove punctuation characters from a single string #
    punc = '''!()-[]{};:'",<>./?@#$%^&*_~'''
    for ch in punc:
        text = text.replace(ch, "")
    return text
def convert_lower(data):
    # Function to convert all upper case letters to lower case #
    for column in data.columns:
        data[column] = data[column].str.lower()
    return data
# Calling the functions to apply preprocessing on the dataset #
# (remove_punc works on one string, so it is applied element-wise) #
MainData['content'] = MainData['content'].astype(str).apply(remove_punc)
MainData = convert_lower(MainData)
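# Illustrative check of the cleaning helper (not in the original script):
# remove_punc("Hello, world!") returns "Hello world".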
stop_words = stopwords.words("english")
# CountVectorizer tokenizes the text and removes stop words #
CountVec = CountVectorizer(stop_words=stop_words)
# Transform the documents into a numerical (sparse document-term) count matrix #
Data = CountVec.fit_transform(MainData.content)
### Model_2 Part ###
def remove_stopwords(texts):
    # Function to remove stop words from tokenized documents #
    return [[word for word in simple_preprocess(str(doc))
             if word not in stop_words] for doc in texts]
def sent_to_words(sentences):
    # Tokenize each sentence into words; deacc=True removes accent marks #
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)
# convert data to list #
Model_2_Data = MainData.content.tolist()
# Calling the functions to apply preprocessing on the dataset #
Data_Words = list(sent_to_words(Model_2_Data))
Data_Words = remove_stopwords(Data_Words)
print("Tokenized words sample:", Data_Words[:1][0][:50])
Dict = corpora.Dictionary(Data_Words)
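# Optionally prune very rare and very common tokens before building the
# bag-of-words corpus; a common Gensim step, not in the original pipeline:
# Dict.filter_extremes(no_below=5, no_above=0.5)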
Corpus = [Dict.doc2bow(text) for text in Data_Words]
print("numerical representation (bag of words) sample:", Corpus[:1][0][:50])
print("_________________________________________________")
print("Preprocessing done.")
print("_________________________________________________")
#####################################
##### Data Visualization #####
# Word cloud generates an image with the most common words in the given data #
Words = " ".join(MainData.content)
# Avoid shadowing the wordcloud module with the instance #
WC = wordcloud.WordCloud()
WC.generate(Words)
WC.to_image().show()
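# The image can also be saved to disk (optional): WC.to_file("wordcloud.png")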
def plot_most_common_words(data_count, countvec):
    # Function to plot a bar chart of the ten most common words #
    words = countvec.get_feature_names_out()
    # Sum the counts over all documents (sparse-safe) #
    total_counts = np.asarray(data_count.sum(axis=0)).ravel()
    word_counts = sorted(zip(words, total_counts), key=lambda x: x[1], reverse=True)[:10]
    words = [w[0] for w in word_counts]
    counts = [w[1] for w in word_counts]
    x_pos = np.arange(len(words))
    plt.bar(x_pos, counts, align='center')
    plt.xticks(x_pos, words, rotation=90)
    plt.xlabel('Words')
    plt.ylabel('Count')
    plt.title('Most common words')
    plt.show()
plot_most_common_words(Data, CountVec)
print("Visualization done.")
print("_________________________________________________")
#####################################
##### Model_1 Training #####
Train_Data, Test_Data = train_test_split(Data, test_size=0.20, random_state=0)
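# 20% of the documents are held out so perplexity can later be measured on unseen data #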
number_topics = 10
number_words = 10
# sklearn model #
LDA = LatentDirichletAllocation(n_components=number_topics, random_state=0)
LDA.fit(Train_Data)
print("Model_1 Training done.")
print("_________________________________________________")
##### Model Evaluation and Results #####
def print_topics(lda_model, countvec, n_top_words):
    # Function to print each topic alongside its most common words #
    words = countvec.get_feature_names_out()
    for topic_idx, topic in enumerate(lda_model.components_):
        print("\nMost common words in topic %d:" % (topic_idx + 1))
        # Sort the topic's word weights in descending order and take the top n #
        print(", ".join(words[i] for i in topic.argsort()[:-n_top_words - 1:-1]))
# Calling the function to print the topics #
print("Topics found via Sklearn LDA:")
print_topics(LDA, CountVec, number_words)
print("_________________________________________________")
# Model_1 evaluation with perplexity metric #
Train_Perp = LDA.perplexity(Train_Data)
print("Train perplexity: ", Train_Perp)
Test_Perp = LDA.perplexity(Test_Data)
print("Test perplexity: ", Test_Perp)
print("_________________________________________________")
##### Model_2 Training #####
G_LDA = LdaModel(corpus=Corpus, id2word=Dict, num_topics=10)
print("Model_2 Training done.")
print("Topics found via Gensim LDA:")
# topic & keywords & weight(importance)
pprint(G_LDA.print_topics())
print("_________________________________________________")
# Model_2 evaluation with coherence score #
cm = CoherenceModel(model=G_LDA, corpus=Corpus, coherence='u_mass')
coherence = cm.get_coherence()
print("u_mass coherence score:", coherence)
def calc_coherence_values(dictionary, corpus, start, end, step):
    # Train an LDA model for each candidate topic count and record its u_mass coherence #
    coherence_values = []
    for num_topics in range(start, end, step):
        model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
        coherence_model = CoherenceModel(model=model, corpus=corpus, coherence='u_mass')
        coherence_values.append(coherence_model.get_coherence())
    return coherence_values
Start, End, Step = 2, 40, 6
coherence_list = calc_coherence_values(dictionary=Dict, corpus=Corpus, start=Start, end=End, step=Step)
x = range(Start, End, Step)
plt.plot(x, coherence_list)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.show()
print("_________________________________________________")