Skip to content

Commit

Permalink
Refactor: moved newsletter_generation
Browse files Browse the repository at this point in the history
  • Loading branch information
picaultj committed Dec 31, 2024
1 parent d9c1664 commit 0055648
Show file tree
Hide file tree
Showing 7 changed files with 29 additions and 29 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
ExtractiveSummarizer,
EnhancedExtractiveSummarizer,
)
from bertrend_apps.newsletters.newsletter_features import generate_newsletter, md2html
from bertrend.llm_utils.newsletter_features import generate_newsletter, md2html


# Define summarizer options
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,9 @@
from bertrend import LLM_CONFIG
from bertrend.llm_utils.openai_client import OpenAI_Client
from bertrend.topic_analysis.representative_docs import get_most_representative_docs
from bertrend_apps.newsletters.prompts import (
FR_USER_SUMMARY_MULTIPLE_DOCS,
EN_USER_SUMMARY_MULTIPLE_DOCS,
FR_USER_GENERATE_TOPIC_LABEL_SUMMARIES_V2,
EN_USER_GENERATE_TOPIC_LABEL_SUMMARIES_V2,
from bertrend.llm_utils.prompts import (
USER_SUMMARY_MULTIPLE_DOCS,
USER_GENERATE_TOPIC_LABEL_SUMMARIES,
)
from bertrend.services.summarizer import Summarizer
from bertrend.services.summary.abstractive_summarizer import AbstractiveSummarizer
Expand Down Expand Up @@ -131,11 +129,7 @@ def generate_newsletter(
article_list += f"Titre : {doc.title}\nContenu : {doc.text}\n\n"

topic_summary = openai_api.generate(
(
FR_USER_SUMMARY_MULTIPLE_DOCS
if prompt_language == "fr"
else EN_USER_SUMMARY_MULTIPLE_DOCS
).format(
(USER_SUMMARY_MULTIPLE_DOCS[prompt_language]).format(
keywords=", ".join(topics_info["Representation"].iloc[i]),
article_list=article_list,
nb_sentences=nb_sentences,
Expand All @@ -156,11 +150,7 @@ def generate_newsletter(
titles = [doc.title for _, doc in sub_df.iterrows()]

improved_topic_description_v2 = openai_api.generate(
(
FR_USER_GENERATE_TOPIC_LABEL_SUMMARIES_V2
if prompt_language == "fr"
else EN_USER_GENERATE_TOPIC_LABEL_SUMMARIES_V2
).format(
(USER_GENERATE_TOPIC_LABEL_SUMMARIES[prompt_language]).format(
newsletter_title=newsletter_title,
title_list=(
" ; ".join(summaries)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# See AUTHORS.txt
# SPDX-License-Identifier: MPL-2.0
# This file is part of BERTrend.
from bertrend.parameters import BERTOPIC_SERIALIZATION

FR_USER_SUMMARY_MULTIPLE_DOCS = (
"Vous êtes une IA hautement qualifiée, formée à la compréhension et à la synthèse du langage. "
Expand All @@ -27,7 +28,10 @@
)
# keywords: list of keywords describing the topic
# article_list: list of articles and their titles

USER_SUMMARY_MULTIPLE_DOCS = {
"fr": FR_USER_SUMMARY_MULTIPLE_DOCS,
"en": EN_USER_SUMMARY_MULTIPLE_DOCS,
}

###################### TOPIC PROMPTS

Expand All @@ -37,6 +41,12 @@
'\n"{title_list}"'
)

EN_USER_GENERATE_TOPIC_LABEL_SUMMARIES = (
"Describe in a short sentence the topic associated with the following extracts. "
"The topic description should be short and specific, no more than 4 words. "
'\n"{title_list}"'
)

FR_USER_GENERATE_TOPIC_LABEL_SUMMARIES_V2 = (
'Dans le cadre de la génération d\'une newsletter sur le thème "{newsletter_title}", '
"décrit en une courte expression le sous-thème associé au texte suivant. "
Expand All @@ -48,12 +58,6 @@

# title_list: list of document extracts belonging to the topic

EN_USER_GENERATE_TOPIC_LABEL_SUMMARIES_V2 = (
"Describe in a short sentence the topic associated with the following extracts. "
"The topic description should be short and specific, no more than 4 words. "
'\n"{title_list}"'
)

EN_USER_GENERATE_TOPIC_LABEL_SUMMARIES_V2 = (
'Within the framework of the generation of a newsletter on the topic "{newsletter_title}", '
"describe in a short sentence the sub-topic associated with the following text. "
Expand All @@ -64,6 +68,12 @@
)
# title_list: list of document extracts belonging to the topic

USER_GENERATE_TOPIC_LABEL_SUMMARIES = {
"fr": FR_USER_GENERATE_TOPIC_LABEL_SUMMARIES_V2,
"en": EN_USER_GENERATE_TOPIC_LABEL_SUMMARIES_V2,
}


FR_USER_GENERATE_TOPIC_LABEL_TITLE = (
"Vous êtes une IA hautement qualifiée, formée à la compréhension et à la synthèse du langage. "
'Après utilisation d\'un algorithme de topic modelling, un topic est représenté par les mots-clé suivants : """{keywords}.""" '
Expand All @@ -74,11 +84,11 @@
# title_list: list of document titles belonging to the topic


FRENCH_TOPIC_REPRESENTATION_PROMPT = (
BERTOPIC_FRENCH_TOPIC_REPRESENTATION_PROMPT = (
"J'ai un topic qui contient les documents suivants :\n"
"[DOCUMENTS]\n"
"Le topic est décrit par les mots-clés suivants : [KEYWORDS]\n"
"Sur la base des informations ci-dessus, extraire une courte étiquette de topic dans le format suivant :\n"
"Topic : <étiquette du sujet>"
)
# Passed directly to BERTopic's OpenAI wrapper, formatted similar to BERTopic's original prompt which can be found in its source code
# Passed directly to BERTopic's OpenAI wrapper, formatted similarly to BERTopic's original prompt which can be found in its source code
4 changes: 2 additions & 2 deletions bertrend/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from bertrend.parameters import STOPWORDS
from bertrend.llm_utils.openai_client import OpenAI_Client
from bertrend.utils.data_loading import TEXT_COLUMN
from bertrend_apps.newsletters.prompts import FRENCH_TOPIC_REPRESENTATION_PROMPT
from bertrend.llm_utils.prompts import BERTOPIC_FRENCH_TOPIC_REPRESENTATION_PROMPT

from bertrend.utils.cache_utils import load_embeddings, save_embeddings, get_hash

Expand Down Expand Up @@ -345,7 +345,7 @@ def train_BERTopic(
model=os.environ["OPENAI_DEFAULT_MODEL_NAME"],
nr_docs=form_parameters["OpenAI_nr_docs"],
prompt=(
FRENCH_TOPIC_REPRESENTATION_PROMPT
BERTOPIC_FRENCH_TOPIC_REPRESENTATION_PROMPT
if form_parameters.get("OpenAI_language", "Français")
== "Français"
else None
Expand Down
2 changes: 1 addition & 1 deletion bertrend_apps/exploration/curebot/veille_analyse.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
)

from bertrend_apps.data_provider.curebot_provider import CurebotProvider
from bertrend_apps.newsletters.newsletter_features import generate_newsletter, md2html
from bertrend.llm_utils.newsletter_features import generate_newsletter, md2html

COLUMN_URL = "url"
MIN_TEXT_LENGTH = 150
Expand Down
2 changes: 1 addition & 1 deletion bertrend_apps/newsletters/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
TEXT_COLUMN,
load_data,
)
from bertrend_apps.newsletters.newsletter_features import (
from bertrend.llm_utils.newsletter_features import (
generate_newsletter,
export_md_string,
)
Expand Down

0 comments on commit 0055648

Please sign in to comment.