Topic_modeling.py
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

class Topic_modeling:
    def __init__(self, *args, **kwargs):
        pass

    def read(self, path):
        # Load a pickled collection of documents into a single-column DataFrame.
        data = pd.read_pickle(path)
        df = pd.DataFrame({"col": data})
        return df
    def tokenize(self, string):
        # Tokenize, normalize whitespace, and strip Latin and Persian punctuation.
        tokens = word_tokenize(string)
        tokens = " ".join(tokens)
        tokens = re.sub(r"\s+", " ", tokens)
        tokens = re.sub(r'[/(){}\[\]\|@,;!٪×،*ـ+؟؛"<>_\-:.]', " ", tokens)
        tokens = re.sub(r'[!٬٫﷼٪×*)(ـ+}|؛؟<>÷«»"\-]', " ", tokens)
        # Drop stopwords and very short tokens; this assumes a Persian stopword
        # list has been added to the NLTK stopwords corpus.
        clean_tokens = [w for w in tokens.split(" ") if w not in stopwords.words("persian")]
        final_token = [w for w in clean_tokens if len(w) > 2]
        return final_token
    def tfidf_vec(self, DF):
        tfidf = TfidfVectorizer(max_df=.95, min_df=2, tokenizer=self.tokenize)
        per_tfidf = tfidf.fit_transform(DF.col)
        return per_tfidf, tfidf
    def NMF(self, path, n_topic):
        # The method name shadows the imported sklearn NMF class on the instance,
        # but the constructor call below still resolves to the module-level import.
        nmf_model = NMF(n_components=n_topic, random_state=42)
        df = self.read(path)
        per_tfidf, tfidf = self.tfidf_vec(df)
        nmf_model.fit(per_tfidf)
        l = []
        for index, topic in enumerate(nmf_model.components_):
            # print(f'THE TOP 18 WORDS FOR TOPIC #{index}')
            # get_feature_names() was removed in recent scikit-learn releases;
            # get_feature_names_out() is the current equivalent.
            l.append([tfidf.get_feature_names_out()[i] for i in topic.argsort()[-18:]])
        DF_words = pd.DataFrame({"Top words for topic": l})
        topic_results = nmf_model.transform(per_tfidf)
        df['topic'] = topic_results.argmax(axis=1)
        return df, DF_words
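

# A minimal usage sketch, not part of the original file: "data.pkl" and the
# topic count below are hypothetical, and it assumes a Persian stopword list
# and the NLTK 'punkt' tokenizer data have already been installed.
if __name__ == "__main__":
    tm = Topic_modeling()
    docs_with_topics, top_words = tm.NMF("data.pkl", n_topic=5)
    print(top_words)                # the 18 highest-weight terms per topic
    print(docs_with_topics.head())  # each document with its dominant topic id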