From 46d11871a96a845bbcb5abfd1f93cf662063b355 Mon Sep 17 00:00:00 2001 From: Jack Gibson Date: Sun, 26 May 2024 14:08:31 -0500 Subject: [PATCH] lda modeling --- civiclens/nlp/pipeline.py | 6 +- civiclens/nlp/topics.py | 57 +++++++++++++----- civiclens/utils/objects/custom_stopwords.pkl | Bin 0 -> 2374 bytes .../objects}/stop_words.pickle | Bin 4 files changed, 44 insertions(+), 19 deletions(-) create mode 100644 civiclens/utils/objects/custom_stopwords.pkl rename civiclens/{nlp/saved_models => utils/objects}/stop_words.pickle (100%) diff --git a/civiclens/nlp/pipeline.py b/civiclens/nlp/pipeline.py index ef468d6f..a46bd948 100644 --- a/civiclens/nlp/pipeline.py +++ b/civiclens/nlp/pipeline.py @@ -13,7 +13,7 @@ from civiclens.nlp.comments import get_doc_comments, rep_comment_analysis from civiclens.nlp.models import sentence_transformer, sentiment_pipeline from civiclens.nlp.tools import RepComments, sentiment_analysis -from civiclens.nlp.topics import FlanLabeler, HDAModel, topic_comment_analysis +from civiclens.nlp.topics import FlanLabeler, TopicModel, topic_comment_analysis from civiclens.utils.database_access import Database, pull_data, upload_comments @@ -142,7 +142,7 @@ def docs_have_titles(): ) # topic modeling - topic_model = HDAModel() + topic_model = TopicModel() comment_data = topic_comment_analysis( comment_data, model=topic_model, @@ -151,4 +151,4 @@ def docs_have_titles(): ) logger.info(f"Proccessed document: {doc_id}") - # upload_comments(Database(), comment_data) + upload_comments(Database(), comment_data) diff --git a/civiclens/nlp/topics.py b/civiclens/nlp/topics.py index 96615629..5361c94a 100644 --- a/civiclens/nlp/topics.py +++ b/civiclens/nlp/topics.py @@ -1,20 +1,20 @@ import pickle from collections import defaultdict -from functools import partial from pathlib import Path from typing import Callable import gensim.corpora as corpora from gensim.corpora import Dictionary -from gensim.models import HdpModel, Phrases +from gensim.models import LdaModel, Phrases from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline from langchain_core.output_parsers import StrOutputParser from langchain_core.prompts import PromptTemplate +from textblob import TextBlob from transformers import pipeline from civiclens.nlp.models import title_model, title_tokenizer from civiclens.nlp.tools import Comment, RepComments -from civiclens.utils.text import clean_text, regex_tokenize +from civiclens.utils.text import clean_text def stopwords(model_path: Path) -> set[str]: @@ -33,18 +33,20 @@ def stopwords(model_path: Path) -> set[str]: return stop_words -class HDAModel: +class TopicModel: """ Peforms LDA topic modeling """ def __init__(self): self.model = None - self.tokenizer = partial(regex_tokenize, pattern=r"\W+") + # self.tokenizer = partial(regex_tokenize, pattern=r"\W+") self.stop_words = stopwords( - Path(__file__).resolve().parent / "saved_models/stop_words.pickle" + Path(__file__).resolve().parent.parent + / "utils/objects/custom_stopwords.pkl" ) self.terms = None + self.pos_tags = {"NN", "NNS", "NNP", "NNPS"} def _process_text( self, comments: list[Comment] @@ -55,16 +57,17 @@ def _process_text( docs = [] document_ids = {} for idx, comment in enumerate(comments): - docs.append(self.tokenizer(clean_text(comment.text).lower())) + docs.append(clean_text(comment.text).lower()) document_ids[idx] = comment.id # remove numbers, 2 character tokens, and stop words docs = [ [ token - for token in doc + for token, tag in TextBlob(doc).tags if not token.isnumeric() and len(token) > 2 + and tag in self.pos_tags and token not in self.stop_words ] for doc in docs @@ -97,7 +100,7 @@ def run_model(self, comments: list[Comment]): docs, document_id = self._process_text(comments) token_dict, corpus = self._create_corpus(docs) - hdp_model = HdpModel(corpus, token_dict) + hdp_model = LdaModel(corpus, id2word=token_dict, num_topics=15) numeric_topics = self._find_best_topic(hdp_model, corpus) comment_topics = {} @@ -115,7 +118,7 @@ def run_model(self, comments: list[Comment]): return comment_topics def _find_best_topic( - self, model: HdpModel, corpus: list[tuple] + self, model: LdaModel, corpus: list[tuple] ) -> dict[int, int]: """ Computes most probable topic per document @@ -187,9 +190,15 @@ def __init__(self) -> None: federal policy. Ensure the label accurately encompasses the main theme represented by all the input words. - Example: - Input words: ["healthcare", "insurance", "coverage", "affordable"] - Output label: "Affordable Healthcare Access" + Examples: + Input words: ["climate", "emissions", "renewable", "energy", "policy"] + Output label: "Climate Change and Renewable Energy Policy" + + Input words: ["tax", "reform", "income", "brackets", "reduction"] + Output label: "Income Tax Reform and Reduction" + + Input words: ["immigration", "policy", "border", "security", "visas"] + Output label: "Immigration Policy and Border Security" Now, generate a topic label for the following list of words: @@ -206,15 +215,30 @@ def __init__(self) -> None: self.hf_pipeline = HuggingFacePipeline(pipeline=self.pipe) self.parse = StrOutputParser() + def _clean_ouput(self, text: str) -> tuple[str]: + """ + Converts LLM output formatted as "Output label: red, blue" into list + of labels, ["Red", "Blue"] + """ + label_text = text.split(": ")[-1] + label_set = set(label_text.split(", ")) + + return tuple(label.title() for label in label_set) + def generate_label(self, summary, terms) -> str: + """ + Creates label for list of topic terms using FLAN + """ if summary: prompt = PromptTemplate.from_template(self.summary_template) chain = prompt | self.hf_pipeline | self.parse - return chain.invoke({"summary": summary, "words": terms}) + return self._clean_ouput( + chain.invoke({"summary": summary, "words": terms}) + ) prompt = PromptTemplate.from_template(self.no_summary_template) chain = prompt | self.hf_pipeline | self.parse - return chain.invoke({"words": terms}) + return self._clean_ouput(chain.invoke({"words": terms})) def label_topics( @@ -239,7 +263,7 @@ def label_topics( def topic_comment_analysis( comment_data: RepComments, - model: HDAModel = None, + model: TopicModel = None, labeler: FlanLabeler = None, sentiment_analyzer: Callable = None, ) -> RepComments: @@ -269,6 +293,7 @@ def topic_comment_analysis( comment_topics = model.run_model(comments) topic_terms = model.get_terms() + # pprint(topic_terms) topic_labels = label_topics(topic_terms, comment_data.summary, labeler) # filter out non_rep comments diff --git a/civiclens/utils/objects/custom_stopwords.pkl b/civiclens/utils/objects/custom_stopwords.pkl new file mode 100644 index 0000000000000000000000000000000000000000..023ff8ea9f653def226aed4b49393d07ae3d15cf GIT binary patch literal 2374 zcmY+GTXv*26h+fKimGxR!*ITO0vU|}Q7BCz76Q}$%K*Gq^5M)k`&^aXEB-K)r29JO z+$;X7{{7?fll}eu{`*^T_wR3~ZS-MHyeyMmX zT)LG*#T?(?p4_smaan`gSi0irp8WLU@m6i4+wc|l+sN}0_w{ot%p1m48@G`YSSjky zY4KecLgyw6Xi3b^*vi$+V+b?hmGA4b@)-QVYT7%zoDyerxE$ij|1sS4-}8Q2{mrcy zRHK*O)8uaPSjuUB?=e#Dh33#}CUg3~hp`09iHSX?L zNM>W%+bMJuW{|YXTq@yln09{sKOU@irq9yQYKDEy*jHgwz$PwXHcpEkd7b)wHF%YP z9Y9B3Iy%NkVb`>`$#+==D(AMkJ8rH9n;@6ZF3y|lKp*g=BJ+50T^AqkPoM8nZFAq# zmUY;Q(y9%HS=&)()ki^p{-A;52c+fW$A~$pDZ`VvGz?#YKQO7hGm$S5z&Ka-0c6!V zK?0YRjw5jK-zbiGg=Kp-oX8$4^!pXa69{2XM=Pok)u6{mGA4P_JxHb;MNbu9q=F)_ zQnxYk(S*)E2aD&b5lOD6c|Z&Q%cnAw%5dyk$g1H}%Rj ztHm)_Dd8qir8I4w``GQd)9aAwYg*sAT@>-Q9p2O{EO5Bby$ESOC4BkRt-@@mcFf+s z&}+5389F2@7*R}aiHinKpg>|&oBvMfr2$ZVvte8J(WowIYOzmIR}I9EQGTXBZkdph zxs5elkAKcQXdW~}nFLrCn#}ak;Md%l4Yd483mT}o=@0;PQrOJn9EsBTgc)v&64 zZM@*L+@hMDv6=-$WMyFm^+tlr~)+CsUdfl=fwbD{r;HAZQ75q<;k6dTLb?ph;|Evqeo6 zgZ~3PsiqmckWlV<+$wh<*BznhR2({OyvtWKRb4@lZkUa?6NAEVIa@(NBIj1pB;ySj zUIhbF&RmLN5f*elvO9i*;z<<8}2MejU!T