From 46d11871a96a845bbcb5abfd1f93cf662063b355 Mon Sep 17 00:00:00 2001
From: Jack Gibson <jpgibson@uchicago.edu>
Date: Sun, 26 May 2024 14:08:31 -0500
Subject: [PATCH] Replace HDP topic model with LDA and filter tokens to nouns

---
 civiclens/nlp/pipeline.py                     |   6 +-
 civiclens/nlp/topics.py                       |  57 +++++++++++++-----
 civiclens/utils/objects/custom_stopwords.pkl  | Bin 0 -> 2374 bytes
 .../objects}/stop_words.pickle                | Bin
 4 files changed, 44 insertions(+), 19 deletions(-)
 create mode 100644 civiclens/utils/objects/custom_stopwords.pkl
 rename civiclens/{nlp/saved_models => utils/objects}/stop_words.pickle (100%)

diff --git a/civiclens/nlp/pipeline.py b/civiclens/nlp/pipeline.py
index ef468d6f..a46bd948 100644
--- a/civiclens/nlp/pipeline.py
+++ b/civiclens/nlp/pipeline.py
@@ -13,7 +13,7 @@
 from civiclens.nlp.comments import get_doc_comments, rep_comment_analysis
 from civiclens.nlp.models import sentence_transformer, sentiment_pipeline
 from civiclens.nlp.tools import RepComments, sentiment_analysis
-from civiclens.nlp.topics import FlanLabeler, HDAModel, topic_comment_analysis
+from civiclens.nlp.topics import FlanLabeler, TopicModel, topic_comment_analysis
 from civiclens.utils.database_access import Database, pull_data, upload_comments
 
 
@@ -142,7 +142,7 @@ def docs_have_titles():
         )
 
         # topic modeling
-        topic_model = HDAModel()
+        topic_model = TopicModel()
         comment_data = topic_comment_analysis(
             comment_data,
             model=topic_model,
@@ -151,4 +151,4 @@ def docs_have_titles():
         )
 
         logger.info(f"Proccessed document: {doc_id}")
-        # upload_comments(Database(), comment_data)
+        upload_comments(Database(), comment_data)
diff --git a/civiclens/nlp/topics.py b/civiclens/nlp/topics.py
index 96615629..5361c94a 100644
--- a/civiclens/nlp/topics.py
+++ b/civiclens/nlp/topics.py
@@ -1,20 +1,20 @@
 import pickle
 from collections import defaultdict
-from functools import partial
 from pathlib import Path
 from typing import Callable
 
 import gensim.corpora as corpora
 from gensim.corpora import Dictionary
-from gensim.models import HdpModel, Phrases
+from gensim.models import LdaModel, Phrases
 from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import PromptTemplate
+from textblob import TextBlob
 from transformers import pipeline
 
 from civiclens.nlp.models import title_model, title_tokenizer
 from civiclens.nlp.tools import Comment, RepComments
-from civiclens.utils.text import clean_text, regex_tokenize
+from civiclens.utils.text import clean_text
 
 
 def stopwords(model_path: Path) -> set[str]:
@@ -33,18 +33,20 @@ def stopwords(model_path: Path) -> set[str]:
     return stop_words
 
 
-class HDAModel:
+class TopicModel:
     """
     Peforms LDA topic modeling
     """
 
     def __init__(self):
         self.model = None
-        self.tokenizer = partial(regex_tokenize, pattern=r"\W+")
+        # self.tokenizer = partial(regex_tokenize, pattern=r"\W+")
         self.stop_words = stopwords(
-            Path(__file__).resolve().parent / "saved_models/stop_words.pickle"
+            Path(__file__).resolve().parent.parent
+            / "utils/objects/custom_stopwords.pkl"
         )
         self.terms = None
+        self.pos_tags = {"NN", "NNS", "NNP", "NNPS"}
 
     def _process_text(
         self, comments: list[Comment]
@@ -55,16 +57,17 @@ def _process_text(
         docs = []
         document_ids = {}
         for idx, comment in enumerate(comments):
-            docs.append(self.tokenizer(clean_text(comment.text).lower()))
+            docs.append(clean_text(comment.text).lower())
             document_ids[idx] = comment.id
 
         # remove numbers, 2 character tokens, and stop words
         docs = [
             [
                 token
-                for token in doc
+                for token, tag in TextBlob(doc).tags
                 if not token.isnumeric()
                 and len(token) > 2
+                and tag in self.pos_tags
                 and token not in self.stop_words
             ]
             for doc in docs
@@ -97,7 +100,7 @@ def run_model(self, comments: list[Comment]):
         docs, document_id = self._process_text(comments)
         token_dict, corpus = self._create_corpus(docs)
 
-        hdp_model = HdpModel(corpus, token_dict)
+        hdp_model = LdaModel(corpus, id2word=token_dict, num_topics=15)
         numeric_topics = self._find_best_topic(hdp_model, corpus)
 
         comment_topics = {}
@@ -115,7 +118,7 @@ def run_model(self, comments: list[Comment]):
         return comment_topics
 
     def _find_best_topic(
-        self, model: HdpModel, corpus: list[tuple]
+        self, model: LdaModel, corpus: list[tuple]
     ) -> dict[int, int]:
         """
         Computes most probable topic per document
@@ -187,9 +190,15 @@ def __init__(self) -> None:
         federal policy. Ensure the label accurately encompasses the main theme
         represented by all the input words.
 
-        Example:
-        Input words: ["healthcare", "insurance", "coverage", "affordable"]
-        Output label: "Affordable Healthcare Access"
+        Examples:
+        Input words: ["climate", "emissions", "renewable", "energy", "policy"]
+        Output label: "Climate Change and Renewable Energy Policy"
+
+        Input words: ["tax", "reform", "income", "brackets", "reduction"]
+        Output label: "Income Tax Reform and Reduction"
+
+        Input words: ["immigration", "policy", "border", "security", "visas"]
+        Output label: "Immigration Policy and Border Security"
 
         Now, generate a topic label for the following list of words:
 
@@ -206,15 +215,30 @@ def __init__(self) -> None:
         self.hf_pipeline = HuggingFacePipeline(pipeline=self.pipe)
         self.parse = StrOutputParser()
 
+    def _clean_ouput(self, text: str) -> tuple[str]:
+        """
+        Converts LLM output formatted as "Output label: red, blue" into a
+        tuple of unique, title-cased labels, e.g. ("Red", "Blue")
+        """
+        label_text = text.split(": ")[-1]
+        label_set = set(label_text.split(", "))
+
+        return tuple(label.title() for label in label_set)
+
     def generate_label(self, summary, terms) -> str:
+        """
+        Creates label for list of topic terms using FLAN
+        """
         if summary:
             prompt = PromptTemplate.from_template(self.summary_template)
             chain = prompt | self.hf_pipeline | self.parse
-            return chain.invoke({"summary": summary, "words": terms})
+            return self._clean_ouput(
+                chain.invoke({"summary": summary, "words": terms})
+            )
 
         prompt = PromptTemplate.from_template(self.no_summary_template)
         chain = prompt | self.hf_pipeline | self.parse
-        return chain.invoke({"words": terms})
+        return self._clean_ouput(chain.invoke({"words": terms}))
 
 
 def label_topics(
@@ -239,7 +263,7 @@ def label_topics(
 
 def topic_comment_analysis(
     comment_data: RepComments,
-    model: HDAModel = None,
+    model: TopicModel = None,
     labeler: FlanLabeler = None,
     sentiment_analyzer: Callable = None,
 ) -> RepComments:
@@ -269,6 +293,7 @@ def topic_comment_analysis(
 
     comment_topics = model.run_model(comments)
     topic_terms = model.get_terms()
+    # pprint(topic_terms)
     topic_labels = label_topics(topic_terms, comment_data.summary, labeler)
 
     # filter out non_rep comments
diff --git a/civiclens/utils/objects/custom_stopwords.pkl b/civiclens/utils/objects/custom_stopwords.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..023ff8ea9f653def226aed4b49393d07ae3d15cf
GIT binary patch
literal 2374
zcmY+GTXv*26h+fKimGxR!*ITO0vU|}Q7BCz76Q}$%K*Gq^5M)k`&^aXEB-K)r29JO
z+$;X7{{7?fll}eu{`*^T_wR3~ZS-MHyeyM<iIuvG`_#X`T~e4kzxwI*{q5}MZi>mX
zT)LG*#T?(?p4_smaan`gSi0irp8WLU@m6i4+wc|l+sN}0_w{ot%p1m48@G`YSSjky
zY4KecLgyw6Xi3b^*vi$+V+b?hmGA4b@)-QVYT7%zoDyerxE$ij|1sS4-}8Q2{mrcy
zRHK*O)8ua<O>PSjuUB?=e#Dh33#}CUg3<kzbO(%H$($@(mjY##`&a%qIL_NOH7H2k
zuWnGiTi7PS6R~Z;4rG+{J_ma#4~Kmb*gcKzCCoQ$TpM?6e_MSQ?{`1<mK*1;y<gnA
zvBY*U;<uio`KgaFD2|@0|DrTJwX;^~3|nb4T28GWol3c^J}q%hqNhGG$5;6e9SBgw
z4`g1jFT*n!)|6!^CeWKEmYrBQi}#ou3k_IjFp4`E)?+-L6+s3nUF>~hp`09iHSX?L
zNM>W%+bMJuW{|YXTq@yln09{sKOU@irq9yQYKDEy*jHgwz$PwXHcpEkd7b)wHF%YP
z9Y9B3Iy%NkVb`>`$#+==D(AMkJ8rH9n;@6ZF3y|lKp*g=BJ+50T^AqkPoM8nZFAq#
zmUY;Q(y9%HS=&)()ki^p{-A;52c+fW$A~$pDZ`VvGz?#YKQO7hGm$S5z&Ka-0c6!V
zK?0YRjw5jK-zbiGg=Kp-oX8$4^!pXa69{2XM=Pok)u6{mGA4P_JxHb;MNbu9q=F)_
zQnx<qHq48PL||EX^}0{0uu$h$_n3Hm^p_FSkn!uYaxBevv2*&8Mrkp_Fgg`Bc_Xae
z=PgKKW+osT*Z07O)l--WdVh3r!<^cjlKT=sc<{b&UH6SRZU!%LeGe&F610rLbY^(b
zYvDTq2bu||r_RkZk40QHxmY3>Yk(S*)E2aD&b5lOD6c|Z&Q%cnAw%5dyk$g1H}%Rj
ztHm)_Dd8qir8I4w``GQd)9aAwYg*sAT@>-Q9p2O{EO5Bby$ESOC4BkRt-@@mcFf+s
z&}+5389F2@7*R}aiHinKpg>|&oBvMfr2$ZVvte8J(WowIYOzmIR}I9EQGTXBZkdph
zxs5elkAKcQXdW~}nFLrCn#}ak;Md%l4Yd483mT}o<X>=@0;PQrOJn9EsBTgc)v&64
zZM@*L+@hMDv6=-$WMyFm^<Gm=#3{9N21Vl-2}EUdH3sI4(`EJ@(sH^8PVUBqC!8%^
zEj${vf-=@iJV@v$?n>+tlr~)+CsUdfl=fwbD{r;HAZQ75q<;k6dTLb?ph;|Evqeo6
zgZ~3PsiqmckWlV<+$wh<*BznhR2({OyvtWKRb4@lZkUa?6NAEVIa@(NBIj1pB;ySj
zUIhbF&RmLN5<c7AU}i?7z^hW!$3kIqY#Sv6?ql6ROb?Y%e_Ri=5WiFj$*Kb$gC{Zg
zaiE)@!NZjRBC`}xY3@)**zzsHy(6^v$#DlPjfq#oL{wTZ(;(jr8l?zm*zKDEwdP`N
w+ph2CG1z~c4KMaVqv2`LYsBK2J6Ks1tw?87?EK&>f*elvO9i*;z<<8}2MejU!T<mO

literal 0
HcmV?d00001

diff --git a/civiclens/nlp/saved_models/stop_words.pickle b/civiclens/utils/objects/stop_words.pickle
similarity index 100%
rename from civiclens/nlp/saved_models/stop_words.pickle
rename to civiclens/utils/objects/stop_words.pickle