-
Notifications
You must be signed in to change notification settings - Fork 2
/
code_joint.py
59 lines (44 loc) · 2.12 KB
/
code_joint.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import pandas as pd
from bertopic import BERTopic
from flair.embeddings import TransformerDocumentEmbeddings
from gensim.models.coherencemodel import CoherenceModel
import gensim.corpora as corpora
from gensim.models import LdaMulticore
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
data = pd.read_csv("arab_gen_twitter.csv")
data.head()
# shape
data.shape
data = data.dropna()
documents = data['text'].values
arabert = TransformerDocumentEmbeddings('aubmindlab/bert-large-arabertv02-twitter')
# Topic Modeling
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
topic_model = BERTopic(language="arabic", low_memory=True ,calculate_probabilities=False, embedding_model=arabert, hdbscan_model=hdbscan_model, umap_model=umap_model)
#umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
#model = BERTopic(language="arabic", low_memory=True ,calculate_probabilities=False, embedding_model=arabert, umap_model=umap_model)
topics, probs = topic_model.fit_transform(documents)
#extract most frequent topics
topic_model.get_topic_freq().head(5)
topic_model.get_topic(1)[:10]
texts = [[word for word in str(document).split()] for document in documents]
id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]
topics=[]
for i in topic_model.get_topics():
row=[]
topic= topic_model.get_topic(i)
for word in topic:
row.append(word[0])
topics.append(row)
# compute coherence score
cm = CoherenceModel(topics=topics, texts=texts, corpus=corpus, dictionary=id2word, coherence='c_v')
#cm = CoherenceModel(topics=topics, texts=texts, corpus=corpus, dictionary=id2word, coherence='u_mass')
#cm = CoherenceModel(topics=topics, texts=texts, corpus=corpus, dictionary=id2word, coherence='c_npmi')
coherence = cm.get_coherence()
print('\nCoherence Score: ', coherence)