-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcode_umap.py
53 lines (40 loc) · 1.61 KB
/
code_umap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import pandas as pd
from bertopic import BERTopic
from flair.embeddings import TransformerDocumentEmbeddings
from gensim.models.coherencemodel import CoherenceModel
import gensim.corpora as corpora
from gensim.models import LdaMulticore
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from umap import UMAP
# Input corpus: the CSV export of the dataset from "ArabGend: Gender analysis
# and inference on Arabic Twitter".
data = pd.read_csv("arab_gen_twitter.csv")

# Notebook-style inspection of the first rows and the (rows, columns) shape.
# As bare expressions these display nothing when the file runs as a script.
data.head()
data.shape

# Discard every row that has at least one missing value in any column.
data = data.dropna(axis=0, how="any")

# Contents of the 'text' column for the remaining rows; this is what the
# topic model is fitted on.
documents = data["text"].values
# Document embedder: flair wrapper around the AraBERT v02 checkpoint.
arabert = TransformerDocumentEmbeddings('aubmindlab/bert-base-arabertv02')

# Topic modelling.
# UMAP is configured explicitly (instead of relying on BERTopic's default)
# with a fixed random_state.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
topic_model = BERTopic(
    language="arabic",
    low_memory=True,
    calculate_probabilities=False,
    embedding_model=arabert,
    umap_model=umap_model,
)
# `topics` holds the per-document result of fit_transform; `probs` is the
# second returned value (calculate_probabilities is disabled above).
topics, probs = topic_model.fit_transform(documents)

# Extract the most frequent topics and look at the leading words of topic 1.
# Both are bare expressions, so nothing is shown outside an interactive session.
topic_model.get_topic_freq().head(5)
topic_model.get_topic(1)[:10]
# Whitespace-tokenise every document (coerced to str first) to build the
# reference corpus used for the coherence computation.
# NOTE(review): this tokenisation is independent of whatever tokeniser the
# topic model used for its topic words -- confirm the two vocabularies overlap.
texts = [str(document).split() for document in documents]

# gensim vocabulary (token <-> id) over the tokenised documents.
id2word = corpora.Dictionary(texts)

# Bag-of-words representation of each tokenised document.
corpus = list(map(id2word.doc2bow, texts))
# Build the list of topic word lists that the coherence model scores.
# From here on `topics` no longer holds the per-document assignments returned
# by fit_transform; it is a list with one list of words per topic.
#
# Two kinds of entries are deliberately left out:
#   * topic -1, which BERTopic uses as the outlier bucket rather than a real
#     topic, so scoring it would distort the coherence value;
#   * empty-string words (BERTopic appears to pad short topics with them --
#     TODO confirm) and topics left with no words at all, which gensim cannot
#     map onto the dictionary.
topics = []
for topic_id, word_scores in topic_model.get_topics().items():
    if topic_id == -1:
        continue
    # Each entry is a (word, score) pair; only the word is needed.
    words = [word for word, _score in word_scores if word]
    if words:
        topics.append(words)
# Score the topic word lists with the NPMI coherence measure ('c_npmi'),
# using the tokenised documents, their bag-of-words form and the dictionary.
cm = CoherenceModel(
    topics=topics,
    texts=texts,
    corpus=corpus,
    dictionary=id2word,
    coherence='c_npmi',
)
coherence = cm.get_coherence()
print('\nCoherence Score: ', coherence)

# Visualize the topics (left disabled):
#topic_model.visualize_topics()