Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Topic Generation #39

Merged
merged 6 commits into from
Mar 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion handlers/ecs/topicmodeling/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.8-slim-buster
FROM python:3.10-slim-buster

LABEL maintainer="[email protected]"

Expand Down
31 changes: 26 additions & 5 deletions handlers/ecs/topicmodeling/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
from fastapi import FastAPI, BackgroundTasks
from pydantic import BaseModel
from botocore.exceptions import ClientError
import mapply
from topic_generator import TopicGenerator
from topic_generator_llm import TopicGenerationLLM
from nlp_modules_utils import (
Database,
StateHandler,
Expand All @@ -25,6 +27,7 @@
)

logging.getLogger().setLevel(logging.INFO)
mapply.init(chunk_size=1, progressbar=False)

SENTRY_DSN = os.environ.get("SENTRY_DSN")
ENVIRONMENT = os.environ.get("ENVIRONMENT")
Expand All @@ -38,7 +41,7 @@ class RequestSchema(BaseModel):
callback_url: str
max_cluster_num: Optional[int] = 10
cluster_size: Optional[int] = 200
umap_components: Optional[int] = 24
umap_components: Optional[int] = 3

ecs_app = FastAPI()

Expand Down Expand Up @@ -154,7 +157,7 @@ def _get_embeddings(
finetuned_task: str = "['first_level_tags']",
return_type: str = "default_analyis",
embeddings_return_type: str = "array",
batch_size: int = 10
batch_size: int = 25
):
"""
Calculates the embeddings of the entries
Expand Down Expand Up @@ -223,9 +226,26 @@ def select_most_relevant_excerpts(self, df):
"""
Select only the most relevant excerpts if it exceeds the cluster size
"""
df.set_index("entry_id", inplace=True)
df_per_topic_nlargest = df.groupby("Topic")["Probability"].nlargest(self.cluster_size).reset_index()
return df_per_topic_nlargest.groupby("Topic")["entry_id"].apply(list).to_dict()
df_per_topic_nlargest = df.groupby("Topic").apply(pd.DataFrame.nlargest, n=self.cluster_size, columns='Probability').reset_index(drop=True)
df_per_topic_nlargest["Representation"] = df_per_topic_nlargest.Representation.apply(", ".join)
data_json = json.loads(
df_per_topic_nlargest.groupby("Topic")[["Document", "Representation", "entry_id"]]
.apply(lambda x: x.to_dict('list')).to_json()
)
for v in data_json.values():
v["Representation"] = " ".join(set(v["Representation"]))
new_df = pd.DataFrame.from_dict(data_json, orient="index")
new_df["label"] = new_df.mapply(self.generate_llm_topic, axis=1)
new_df.drop(columns=["Representation", "Document"], inplace=True)
return new_df.to_dict(orient="index")

def generate_llm_topic(self, x: pd.DataFrame, max_excerpts: int=20):
    """
    Produce a short LLM-generated label for a single topic row.

    Only the first ``max_excerpts`` documents (default 20) are forwarded to
    the model, together with the row's keyword representation.
    NOTE(review): the annotation says pd.DataFrame, but mapply(axis=1)
    appears to pass each row in as a Series — confirm with the caller.
    """
    excerpts = x["Document"][:max_excerpts]
    labeler = TopicGenerationLLM(excerpts, x["Representation"])
    return labeler.topic_generator_handler()

def dispatch_results(self, status, presigned_url=None):
"""
Expand Down Expand Up @@ -305,6 +325,7 @@ def __call__(self):
topics_dict = self.select_most_relevant_excerpts(df_merged)
else:
topics_dict = {}

date_today = date.today().isoformat()
try:
presigned_url = upload_to_s3(
Expand Down
2,078 changes: 1,648 additions & 430 deletions handlers/ecs/topicmodeling/poetry.lock

Large diffs are not rendered by default.

9 changes: 7 additions & 2 deletions handlers/ecs/topicmodeling/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,18 @@ description = ""
authors = ["dev <[email protected]>"]

[tool.poetry.dependencies]
python = ">=3.8"
python = ">=3.9,<4.0"
pillow = "==9.0.0"
fastapi = "^0.95.1"
uvicorn = "^0.22.0"
sentry-sdk = "==1.5.8"
topic_generator = { git = "https://github.com/the-deep-nlp/topic-generator", rev = "534d857", branch = "main" }
openai = ">=0.27.8"
langchain = ">=0.0.238"
tiktoken = ">=0.4.0"
langchain-openai = "^0.0.2.post1"
topic_generator = { git = "https://github.com/the-deep-nlp/topic-generator", rev = "b0d5d18", branch = "main" }
nlp_modules_utils = { git = "https://github.com/the-deep-nlp/nlp-modules-utils.git", rev = "bc82d18", branch = "main" }
mapply = "^0.1.25"

[tool.poetry.dev-dependencies]

Expand Down
93 changes: 93 additions & 0 deletions handlers/ecs/topicmodeling/topic_generator_llm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import os
import re
import boto3
from langchain_openai.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.callbacks import get_openai_callback

from nlp_modules_utils import add_metric_data

# OpenAI credentials and deployment environment; both are injected as
# environment variables (see the ECS task definition in this PR).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
ENVIRONMENT = os.environ.get("ENVIRONMENT")

# Shared CloudWatch client used to publish token-usage/cost metrics
# from topic_generator_handler(); region falls back to us-east-1.
cloudwatch_client = boto3.client(
    "cloudwatch",
    region_name=os.environ.get("AWS_REGION", "us-east-1")
)

class TopicGenerationLLM:
    """ Topic label generation using OpenAI gpt-3.5.

    Given the documents (excerpts) of one topic cluster and the cluster's
    keywords, asks the chat model for a short label describing the topic,
    and reports token/cost metrics to CloudWatch.
    Url: https://platform.openai.com/docs/models/gpt-3-5
    """
    def __init__(self,
        texts: list,
        keywords: str,
        model_name: str="gpt-3.5-turbo-1106",
        temperature: float=0.2,
        max_tokens: int=4096,
        max_retries: int=2,
        req_timeout: int=60
    ):
        # texts: excerpts belonging to one topic cluster.
        # keywords: representative keywords of the cluster — presumably a
        #   comma-joined string built by the caller; verify against caller.
        self.texts = texts
        self.keywords = keywords
        # All remaining parameters are passed straight through to the
        # ChatOpenAI client (req_timeout is in seconds).
        self.llm = ChatOpenAI(
            model_name=model_name,
            temperature=temperature,
            max_tokens=max_tokens,
            max_retries=max_retries,
            request_timeout=req_timeout
        )

    def define_prompt_template(self):
        """ Return the prompt template string; it has {documents} and
        {keywords} slots filled in by topic_generator_handler. """

        map_template = """
        I have a topic that contains the following documents in a List.
        {documents}
        The topic is described by the following keywords: {keywords}
        Based on the information about the topic above, please create a short label of this topic.
        Make sure to only return the label and nothing more.
        """

        return map_template

    def topic_generator_handler(self):
        """ Run the labeling chain and return the cleaned topic label.

        Token counts and cost for the request are collected via the OpenAI
        callback and published to CloudWatch under dimension
        Module=TopicModel, one metric per entry of summary_info.
        """
        generated_summary = None

        map_template = self.define_prompt_template()

        template_prompt = PromptTemplate.from_template(map_template)

        summarize_chain = LLMChain(
            llm=self.llm,
            prompt=template_prompt
        )

        # The callback context accumulates usage/cost stats for the call.
        with get_openai_callback() as cb:
            generated_summary = summarize_chain.invoke({
                "documents": self.texts,
                "keywords": self.keywords
            })
            summary_info = {
                "total_tokens": cb.total_tokens,
                "prompt_tokens": cb.prompt_tokens,
                "completion_tokens": cb.completion_tokens,
                "total_cost": round(cb.total_cost, 4),
                "request_count": cb.successful_requests
            }
            for metric_name, metric_value in summary_info.items():
                add_metric_data(
                    cw_client=cloudwatch_client,
                    metric_name=metric_name,
                    metric_value=metric_value,
                    dimension_name="Module",
                    dimension_value="TopicModel",
                    environment=ENVIRONMENT
                )
        # LLMChain.invoke returns a dict; the model output is under "text".
        return self.postprocess(generated_summary["text"])

    def postprocess(self, texts: str):
        """ Strip leading/trailing non-alphanumeric characters
        (quotes, punctuation, whitespace) from the generated label. """
        return re.sub(r'^[^A-Za-z0-9]+|[^A-Za-z0-9]+$', '', texts)
1 change: 1 addition & 0 deletions main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ module "topicmodel" {
ssm_db_password_arn = module.secrets.ssm_db_password_arn
ssm_db_port_arn = module.secrets.ssm_db_port_arn
ssm_sentry_dsn_url_arn = module.secrets.ssm_sentry_dsn_url_arn
ssm_openai_api_key_arn = var.environment == "staging" ? module.secrets.ssm_topicmodel_openai_api_key_staging_arn : module.secrets.ssm_topicmodel_openai_api_key_prod_arn

# db table
db_table_name = var.db_table_name
Expand Down
4 changes: 4 additions & 0 deletions modules/ecsmodules/topicmodeling/ecs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ resource "aws_ecs_task_definition" "task-def" {
{
"name": "SENTRY_DSN",
"valueFrom": "${var.ssm_sentry_dsn_url_arn}"
},
{
"name": "OPENAI_API_KEY",
"valueFrom": "${var.ssm_openai_api_key_arn}"
}
]
}
Expand Down
1 change: 1 addition & 0 deletions modules/ecsmodules/topicmodeling/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ variable "ssm_db_username_arn" {}
variable "ssm_db_password_arn" {}
variable "ssm_db_port_arn" {}
variable "ssm_sentry_dsn_url_arn" {}
variable "ssm_openai_api_key_arn" {}

# db table
variable "db_table_name" {}
Expand Down
8 changes: 8 additions & 0 deletions modules/secrets/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,12 @@ output "ssm_openai_api_key_staging_arn" {

output "ssm_openai_api_key_prod_arn" {
value = data.aws_ssm_parameter.openai_api_key_prod.arn
}

output "ssm_topicmodel_openai_api_key_staging_arn" {
value = data.aws_ssm_parameter.topicmodel_openai_api_key_staging.arn
}

output "ssm_topicmodel_openai_api_key_prod_arn" {
value = data.aws_ssm_parameter.topicmodel_openai_api_key_prod.arn
}
8 changes: 8 additions & 0 deletions modules/secrets/ssm.tf
Original file line number Diff line number Diff line change
Expand Up @@ -68,4 +68,12 @@ data "aws_ssm_parameter" "openai_api_key_staging" {

data "aws_ssm_parameter" "openai_api_key_prod" {
name = "openai_api_key_prod"
}

data "aws_ssm_parameter" "topicmodel_openai_api_key_staging" {
name = "topicmodel_openai_api_key_staging"
}

data "aws_ssm_parameter" "topicmodel_openai_api_key_prod" {
name = "topicmodel_openai_api_key_prod"
}
Loading