Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Topic Generation #39

Merged
merged 6 commits into from
Mar 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion handlers/ecs/topicmodeling/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.8-slim-buster
FROM python:3.10-slim-buster

LABEL maintainer="[email protected]"

Expand Down
31 changes: 26 additions & 5 deletions handlers/ecs/topicmodeling/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
from fastapi import FastAPI, BackgroundTasks
from pydantic import BaseModel
from botocore.exceptions import ClientError
import mapply
from topic_generator import TopicGenerator
from topic_generator_llm import TopicGenerationLLM
from nlp_modules_utils import (
Database,
StateHandler,
Expand All @@ -25,6 +27,7 @@
)

logging.getLogger().setLevel(logging.INFO)
mapply.init(chunk_size=1, progressbar=False)

SENTRY_DSN = os.environ.get("SENTRY_DSN")
ENVIRONMENT = os.environ.get("ENVIRONMENT")
Expand All @@ -38,7 +41,7 @@ class RequestSchema(BaseModel):
callback_url: str
max_cluster_num: Optional[int] = 10
cluster_size: Optional[int] = 200
umap_components: Optional[int] = 24
umap_components: Optional[int] = 3

ecs_app = FastAPI()

Expand Down Expand Up @@ -154,7 +157,7 @@ def _get_embeddings(
finetuned_task: str = "['first_level_tags']",
return_type: str = "default_analyis",
embeddings_return_type: str = "array",
batch_size: int = 10
batch_size: int = 25
):
"""
Calculates the embeddings of the entries
Expand Down Expand Up @@ -223,9 +226,26 @@ def select_most_relevant_excerpts(self, df):
"""
Select only the most relevant excerpts if it exceeds the cluster size
"""
df.set_index("entry_id", inplace=True)
df_per_topic_nlargest = df.groupby("Topic")["Probability"].nlargest(self.cluster_size).reset_index()
return df_per_topic_nlargest.groupby("Topic")["entry_id"].apply(list).to_dict()
df_per_topic_nlargest = df.groupby("Topic").apply(pd.DataFrame.nlargest, n=self.cluster_size, columns='Probability').reset_index(drop=True)
df_per_topic_nlargest["Representation"] = df_per_topic_nlargest.Representation.apply(", ".join)
data_json = json.loads(
df_per_topic_nlargest.groupby("Topic")[["Document", "Representation", "entry_id"]]
.apply(lambda x: x.to_dict('list')).to_json()
)
for v in data_json.values():
v["Representation"] = " ".join(set(v["Representation"]))
new_df = pd.DataFrame.from_dict(data_json, orient="index")
new_df["label"] = new_df.mapply(self.generate_llm_topic, axis=1)
new_df.drop(columns=["Representation", "Document"], inplace=True)
return new_df.to_dict(orient="index")

def generate_llm_topic(self, x: pd.DataFrame, max_excerpts: int=20):
    """
    Produce a short LLM-generated label for a single topic row.

    Only the first ``max_excerpts`` documents (default 20) are forwarded to
    the model, together with the row's keyword representation.
    NOTE(review): the annotation says pd.DataFrame, but mapply(axis=1)
    appears to pass each row in as a Series — confirm with the caller.
    """
    excerpts = x["Document"][:max_excerpts]
    labeler = TopicGenerationLLM(excerpts, x["Representation"])
    return labeler.topic_generator_handler()

def dispatch_results(self, status, presigned_url=None):
"""
Expand Down Expand Up @@ -305,6 +325,7 @@ def __call__(self):
topics_dict = self.select_most_relevant_excerpts(df_merged)
else:
topics_dict = {}

date_today = date.today().isoformat()
try:
presigned_url = upload_to_s3(
Expand Down
2,078 changes: 1,648 additions & 430 deletions handlers/ecs/topicmodeling/poetry.lock

Large diffs are not rendered by default.

9 changes: 7 additions & 2 deletions handlers/ecs/topicmodeling/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,18 @@ description = ""
authors = ["dev <[email protected]>"]

[tool.poetry.dependencies]
python = ">=3.8"
python = ">=3.9,<4.0"
pillow = "==9.0.0"
fastapi = "^0.95.1"
uvicorn = "^0.22.0"
sentry-sdk = "==1.5.8"
topic_generator = { git = "https://github.com/the-deep-nlp/topic-generator", rev = "534d857", branch = "main" }
openai = ">=0.27.8"
langchain = ">=0.0.238"
tiktoken = ">=0.4.0"
langchain-openai = "^0.0.2.post1"
topic_generator = { git = "https://github.com/the-deep-nlp/topic-generator", rev = "b0d5d18", branch = "main" }
nlp_modules_utils = { git = "https://github.com/the-deep-nlp/nlp-modules-utils.git", rev = "bc82d18", branch = "main" }
mapply = "^0.1.25"

[tool.poetry.dev-dependencies]

Expand Down
93 changes: 93 additions & 0 deletions handlers/ecs/topicmodeling/topic_generator_llm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import os
import re
import boto3
from langchain_openai.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.callbacks import get_openai_callback

from nlp_modules_utils import add_metric_data

# OpenAI credentials and deployment environment; both are injected as
# environment variables (see the ECS task definition in this PR).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
ENVIRONMENT = os.environ.get("ENVIRONMENT")

# Shared CloudWatch client used to publish token-usage/cost metrics
# from topic_generator_handler(); region falls back to us-east-1.
cloudwatch_client = boto3.client(
    "cloudwatch",
    region_name=os.environ.get("AWS_REGION", "us-east-1")
)

class TopicGenerationLLM:
    """ Topic label generation using OpenAI gpt-3.5.

    Given the documents (excerpts) of one topic cluster and the cluster's
    keywords, asks the chat model for a short label describing the topic,
    and reports token/cost metrics to CloudWatch.
    Url: https://platform.openai.com/docs/models/gpt-3-5
    """
    def __init__(self,
        texts: list,
        keywords: str,
        model_name: str="gpt-3.5-turbo-1106",
        temperature: float=0.2,
        max_tokens: int=4096,
        max_retries: int=2,
        req_timeout: int=60
    ):
        # texts: excerpts belonging to one topic cluster.
        # keywords: representative keywords of the cluster — presumably a
        #   comma-joined string built by the caller; verify against caller.
        self.texts = texts
        self.keywords = keywords
        # All remaining parameters are passed straight through to the
        # ChatOpenAI client (req_timeout is in seconds).
        self.llm = ChatOpenAI(
            model_name=model_name,
            temperature=temperature,
            max_tokens=max_tokens,
            max_retries=max_retries,
            request_timeout=req_timeout
        )

    def define_prompt_template(self):
        """ Return the prompt template string; it has {documents} and
        {keywords} slots filled in by topic_generator_handler. """

        map_template = """
        I have a topic that contains the following documents in a List.
        {documents}
        The topic is described by the following keywords: {keywords}
        Based on the information about the topic above, please create a short label of this topic.
        Make sure to only return the label and nothing more.
        """

        return map_template

    def topic_generator_handler(self):
        """ Run the labeling chain and return the cleaned topic label.

        Token counts and cost for the request are collected via the OpenAI
        callback and published to CloudWatch under dimension
        Module=TopicModel, one metric per entry of summary_info.
        """
        generated_summary = None

        map_template = self.define_prompt_template()

        template_prompt = PromptTemplate.from_template(map_template)

        summarize_chain = LLMChain(
            llm=self.llm,
            prompt=template_prompt
        )

        # The callback context accumulates usage/cost stats for the call.
        with get_openai_callback() as cb:
            generated_summary = summarize_chain.invoke({
                "documents": self.texts,
                "keywords": self.keywords
            })
            summary_info = {
                "total_tokens": cb.total_tokens,
                "prompt_tokens": cb.prompt_tokens,
                "completion_tokens": cb.completion_tokens,
                "total_cost": round(cb.total_cost, 4),
                "request_count": cb.successful_requests
            }
            for metric_name, metric_value in summary_info.items():
                add_metric_data(
                    cw_client=cloudwatch_client,
                    metric_name=metric_name,
                    metric_value=metric_value,
                    dimension_name="Module",
                    dimension_value="TopicModel",
                    environment=ENVIRONMENT
                )
        # LLMChain.invoke returns a dict; the model output is under "text".
        return self.postprocess(generated_summary["text"])

    def postprocess(self, texts: str):
        """ Strip leading/trailing non-alphanumeric characters
        (quotes, punctuation, whitespace) from the generated label. """
        return re.sub(r'^[^A-Za-z0-9]+|[^A-Za-z0-9]+$', '', texts)
1 change: 1 addition & 0 deletions main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ module "topicmodel" {
ssm_db_password_arn = module.secrets.ssm_db_password_arn
ssm_db_port_arn = module.secrets.ssm_db_port_arn
ssm_sentry_dsn_url_arn = module.secrets.ssm_sentry_dsn_url_arn
ssm_openai_api_key_arn = var.environment == "staging" ? module.secrets.ssm_topicmodel_openai_api_key_staging_arn : module.secrets.ssm_topicmodel_openai_api_key_prod_arn

# db table
db_table_name = var.db_table_name
Expand Down
4 changes: 4 additions & 0 deletions modules/ecsmodules/topicmodeling/ecs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ resource "aws_ecs_task_definition" "task-def" {
{
"name": "SENTRY_DSN",
"valueFrom": "${var.ssm_sentry_dsn_url_arn}"
},
{
"name": "OPENAI_API_KEY",
"valueFrom": "${var.ssm_openai_api_key_arn}"
}
]
}
Expand Down
1 change: 1 addition & 0 deletions modules/ecsmodules/topicmodeling/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ variable "ssm_db_username_arn" {}
variable "ssm_db_password_arn" {}
variable "ssm_db_port_arn" {}
variable "ssm_sentry_dsn_url_arn" {}
variable "ssm_openai_api_key_arn" {}

# db table
variable "db_table_name" {}
Expand Down
8 changes: 8 additions & 0 deletions modules/secrets/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,12 @@ output "ssm_openai_api_key_staging_arn" {

output "ssm_openai_api_key_prod_arn" {
value = data.aws_ssm_parameter.openai_api_key_prod.arn
}

output "ssm_topicmodel_openai_api_key_staging_arn" {
value = data.aws_ssm_parameter.topicmodel_openai_api_key_staging.arn
}

output "ssm_topicmodel_openai_api_key_prod_arn" {
value = data.aws_ssm_parameter.topicmodel_openai_api_key_prod.arn
}
8 changes: 8 additions & 0 deletions modules/secrets/ssm.tf
Original file line number Diff line number Diff line change
Expand Up @@ -68,4 +68,12 @@ data "aws_ssm_parameter" "openai_api_key_staging" {

data "aws_ssm_parameter" "openai_api_key_prod" {
name = "openai_api_key_prod"
}

data "aws_ssm_parameter" "topicmodel_openai_api_key_staging" {
name = "topicmodel_openai_api_key_staging"
}

data "aws_ssm_parameter" "topicmodel_openai_api_key_prod" {
name = "topicmodel_openai_api_key_prod"
}
Loading