From 608d34c20a1155f19b578813ee70159621eecf7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Tamines?= Date: Thu, 17 Oct 2024 01:37:50 +0200 Subject: [PATCH 01/19] update langchain keys and examples --- bertopic/representation/_langchain.py | 106 ++++++++++++-------------- 1 file changed, 49 insertions(+), 57 deletions(-) diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index df5c4839..720e7618 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -1,35 +1,22 @@ import pandas as pd -from langchain.docstore.document import Document +from langchain_core.documents import Document from scipy.sparse import csr_matrix from typing import Callable, Mapping, List, Tuple, Union from bertopic.representation._base import BaseRepresentation from bertopic.representation._utils import truncate_document -DEFAULT_PROMPT = "What are these documents about? Please give a single label." - class LangChain(BaseRepresentation): """Using chains in langchain to generate topic labels. - The classic example uses `langchain.chains.question_answering.load_qa_chain`. - This returns a chain that takes a list of documents and a question as input. - - You can also use Runnables such as those composed using the LangChain Expression Language. + You can use chains or Runnables such as those composed using the LangChain Expression Language + as long as their schema respects the conditions defined below. Arguments: chain: The langchain chain or Runnable with a `batch` method. - Input keys must be `input_documents` and `question`. - Output key must be `output_text`. - prompt: The prompt to be used in the model. If no prompt is given, - `self.default_prompt_` is used instead. - NOTE: Use `"[KEYWORDS]"` in the prompt - to decide where the keywords need to be - inserted. Keywords won't be included unless - indicated. 
Unlike other representation models, - Langchain does not use the `"[DOCUMENTS]"` tag - to insert documents into the prompt. The load_qa_chain function - formats the representative documents within the prompt. + Input keys must be `documents` (mandatory) and `keywords` (optional). + Output key must be `representation`. nr_docs: The number of documents to pass to LangChain diversity: The diversity of documents to pass to LangChain. Accepts values between 0 and 1. A higher @@ -60,14 +47,30 @@ class LangChain(BaseRepresentation): like openai: `pip install langchain` - `pip install openai` + `pip install langchain_openai` Then, you can create your chain as follows: ```python - from langchain.chains.question_answering import load_qa_chain - from langchain.llms import OpenAI - chain = load_qa_chain(OpenAI(temperature=0, openai_api_key=my_openai_api_key), chain_type="stuff") + from langchain_openai import ChatOpenAI + from langchain_core.prompts import ChatPromptTemplate + from langchain_core.runnables import RunnablePassthrough + from langchain.chains.combine_documents import create_stuff_documents_chain + + chat_model = ChatOpenAI(model=..., api_key=...) + + # For simple prompts + prompt = ChatPromptTemplate.from_template("What are these documents about? {documents}. 
Here are some keywords about them {keywords} Please give a single label.") + + # For multi-message prompts + prompt = ChatPromptTemplate.from_messages( + [ + ("system", "You are provided with a list of documents and are asked to provide a single label for the topic."), + ("human", "Here is the list of documents: {documents}"), + ] + ) + + chain = RunnablePassthrough.assign(representation=create_stuff_documents_chain(chat_model, prompt, document_variable_name="documents")) ``` Finally, you can pass the chain to BERTopic as follows: @@ -82,26 +85,21 @@ class LangChain(BaseRepresentation): topic_model = BERTopic(representation_model=representation_model) ``` - You can also use a custom prompt: - - ```python - prompt = "What are these documents about? Please give a single label." - representation_model = LangChain(chain, prompt=prompt) - ``` - You can also use a Runnable instead of a chain. The example below uses the LangChain Expression Language: ```python from bertopic.representation import LangChain - from langchain.chains.question_answering import load_qa_chain from langchain.chat_models import ChatAnthropic - from langchain.schema.document import Document - from langchain.schema.runnable import RunnablePassthrough + from langchain_core.documents import Document + from langchain_core.prompts import ChatPromptTemplate + from langchain_core.runnables import RunnablePassthrough + from langchain.chains.combine_documents import create_stuff_documents_chain from langchain_experimental.data_anonymizer.presidio import PresidioReversibleAnonymizer prompt = ... - llm = ... + + chat_model = ... 
# We will construct a special privacy-preserving chain using Microsoft Presidio @@ -109,7 +107,7 @@ class LangChain(BaseRepresentation): chain = ( { - "input_documents": ( + "documents": ( lambda inp: [ Document( page_content=pii_handler.anonymize( @@ -117,23 +115,22 @@ class LangChain(BaseRepresentation): language="en", ), ) - for d in inp["input_documents"] + for d in inp["documents"] ] ), - "question": RunnablePassthrough(), + "keywords": RunnablePassthrough(), } - | load_qa_chain(representation_llm, chain_type="stuff") - | (lambda output: {"output_text": pii_handler.deanonymize(output["output_text"])}) + | create_stuff_documents_chain(chat_model, prompt, document_variable_name="documents") + | (lambda output: {"representation": pii_handler.deanonymize(output["representation"])}) ) - representation_model = LangChain(chain, prompt=representation_prompt) + representation_model = LangChain(chain) ``` """ def __init__( self, chain, - prompt: str = None, nr_docs: int = 4, diversity: float = None, doc_length: int = None, @@ -141,8 +138,6 @@ def __init__( chain_config=None, ): self.chain = chain - self.prompt = prompt if prompt is not None else DEFAULT_PROMPT - self.default_prompt_ = DEFAULT_PROMPT self.chain_config = chain_config self.nr_docs = nr_docs self.diversity = diversity @@ -186,24 +181,21 @@ def extract_topics( for docs in repr_docs_mappings.values() ] - # `self.chain` must take `input_documents` and `question` as input keys - # Use a custom prompt that leverages keywords, using the tag: [KEYWORDS] - if "[KEYWORDS]" in self.prompt: - prompts = [] - for topic in topics: - keywords = list(zip(*topics[topic]))[0] - prompt = self.prompt.replace("[KEYWORDS]", ", ".join(keywords)) - prompts.append(prompt) + # `self.chain` must take `documents` as a mandatory input key and `keywords` as an optional input key + formatted_keywords_list = [] + for topic in topics: + keywords = list(zip(*topics[topic]))[0] + formatted_keywords_list.append(", ".join(keywords)) - inputs = 
[{"input_documents": docs, "question": prompt} for docs, prompt in zip(chain_docs, prompts)] - - else: - inputs = [{"input_documents": docs, "question": self.prompt} for docs in chain_docs] + # Documents are passed as a list of langchain Document objects, it is up to the chain to format them into a str + inputs = [ + {"documents": docs, "keywords": formatted_keywords} + for docs, formatted_keywords in zip(chain_docs, formatted_keywords_list) + ] - # `self.chain` must return a dict with an `output_text` key - # same output key as the `StuffDocumentsChain` returned by `load_qa_chain` + # `self.chain` must return a dict with an `representation` key outputs = self.chain.batch(inputs=inputs, config=self.chain_config) - labels = [output["output_text"].strip() for output in outputs] + labels = [output["representation"].strip() for output in outputs] updated_topics = { topic: [(label, 1)] + [("", 0) for _ in range(9)] for topic, label in zip(repr_docs_mappings.keys(), labels) From 8fed3763378389aa22275b44422959a4d8da8ec3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Tamines?= Date: Thu, 17 Oct 2024 01:38:01 +0200 Subject: [PATCH 02/19] update langchain docs for representation --- docs/getting_started/representation/llm.md | 30 ++++++++++++++-------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/docs/getting_started/representation/llm.md b/docs/getting_started/representation/llm.md index ab538bd3..2b35571d 100644 --- a/docs/getting_started/representation/llm.md +++ b/docs/getting_started/representation/llm.md @@ -479,15 +479,23 @@ To use langchain, you will need to install the langchain package first. 
Addition like openai: ```bash -pip install langchain, openai +pip install langchain +pip install langchain_openai ``` Then, you can create your chain as follows: ```python -from langchain.chains.question_answering import load_qa_chain -from langchain.llms import OpenAI -chain = load_qa_chain(OpenAI(temperature=0, openai_api_key=my_openai_api_key), chain_type="stuff") +from langchain_openai import ChatOpenAI +from langchain_core.prompts import ChatPromptTemplate +from langchain_core.runnables import RunnablePassthrough +from langchain.chains.combine_documents import create_stuff_documents_chain + +chat_model = ChatOpenAI(model=..., api_key=...) + +prompt = ChatPromptTemplate.from_template("What are these documents about? {documents}. Please give a single label.") + +chain = RunnablePassthrough.assign(representation=create_stuff_documents_chain(chat_model, prompt, document_variable_name="documents")) ``` Finally, you can pass the chain to BERTopic as follows: @@ -502,17 +510,17 @@ representation_model = LangChain(chain) topic_model = BERTopic(representation_model=representation_model) ``` -You can also use a custom prompt: +You can also customize the prompt, and include the optional `keywords` placeholder to add the keywords to the prompt. ```python -prompt = "What are these documents about? Please give a single label." -representation_model = LangChain(chain, prompt=prompt) +prompt = ChatPromptTemplate.from_messages( + [ + ("system", "You are provided with a list of documents and are asked to provide a single label for the topic."), + ("human", "Here is the list of documents: {documents} and related keywords: {keywords}"), + ] +) ``` -!!! note Note - The prompt does not make use of `[KEYWORDS]` and `[DOCUMENTS]` tags as - the documents are already used within langchain's `load_qa_chain`. 
- ## **Cohere** Instead of using a language model from 🤗 transformers, we can use external APIs instead that From 848a5e2b47114b1596a2d982b3664c5b9b46e9ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Tamines?= Date: Tue, 22 Oct 2024 22:29:26 +0000 Subject: [PATCH 03/19] Rename keys, add default prompt, remove output key --- bertopic/representation/_langchain.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index 720e7618..2fec0936 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -1,5 +1,7 @@ import pandas as pd from langchain_core.documents import Document +from langchain_core.prompts import ChatPromptTemplate + from scipy.sparse import csr_matrix from typing import Callable, Mapping, List, Tuple, Union @@ -7,6 +9,11 @@ from bertopic.representation._utils import truncate_document +DEFAULT_PROMPT = ChatPromptTemplate.from_template( + "What are these documents about? {DOCUMENTS} Here are keywords related to them {KEYWORDS}. Your output is a single label without any formatting." +) + + class LangChain(BaseRepresentation): """Using chains in langchain to generate topic labels. @@ -15,8 +22,7 @@ class LangChain(BaseRepresentation): Arguments: chain: The langchain chain or Runnable with a `batch` method. - Input keys must be `documents` (mandatory) and `keywords` (optional). - Output key must be `representation`. + Input keys must be `DOCUMENTS` (mandatory) and `KEYWORDS` (optional). nr_docs: The number of documents to pass to LangChain diversity: The diversity of documents to pass to LangChain. Accepts values between 0 and 1. 
A higher @@ -189,13 +195,13 @@ def extract_topics( # Documents are passed as a list of langchain Document objects, it is up to the chain to format them into a str inputs = [ - {"documents": docs, "keywords": formatted_keywords} + {"DOCUMENTS": docs, "KEYWORDS": formatted_keywords} for docs, formatted_keywords in zip(chain_docs, formatted_keywords_list) ] # `self.chain` must return a dict with an `representation` key outputs = self.chain.batch(inputs=inputs, config=self.chain_config) - labels = [output["representation"].strip() for output in outputs] + labels = [output.strip() for output in outputs] updated_topics = { topic: [(label, 1)] + [("", 0) for _ in range(9)] for topic, label in zip(repr_docs_mappings.keys(), labels) From 703d533aef7b030843b9b768df0174676dc8a0cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Tamines?= Date: Thu, 31 Oct 2024 15:15:45 +0100 Subject: [PATCH 04/19] Reverting to original implementation and adding a new default prompt --- bertopic/representation/_langchain.py | 135 ++++++++++++++++---------- 1 file changed, 82 insertions(+), 53 deletions(-) diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index 2fec0936..53033ffb 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -1,7 +1,5 @@ import pandas as pd -from langchain_core.documents import Document -from langchain_core.prompts import ChatPromptTemplate - +from langchain.docstore.document import Document from scipy.sparse import csr_matrix from typing import Callable, Mapping, List, Tuple, Union @@ -9,20 +7,56 @@ from bertopic.representation._utils import truncate_document -DEFAULT_PROMPT = ChatPromptTemplate.from_template( - "What are these documents about? {DOCUMENTS} Here are keywords related to them {KEYWORDS}. Your output is a single label without any formatting." -) +DEFAULT_PROMPT = """ +This is a list of texts where each collection of texts describe a topic. 
After each collection of texts, the name of the topic they represent is mentioned as a short-highly-descriptive title +--- +Topic: +Sample texts from this topic: +- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food. +- Meat, but especially beef, is the word food in terms of emissions. +- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one. + +Keywords: meat beef eat eating emissions steak food health processed chicken +Topic name: Environmental impacts of eating meat +--- +Topic: +Sample texts from this topic: +- I have ordered the product weeks ago but it still has not arrived! +- The website mentions that it only takes a couple of days to deliver but I still have not received mine. +- I got a message stating that I received the monitor but that is not true! +- It took a month longer to deliver than was advised... + +Keywords: deliver weeks product shipping long delivery received arrived arrive week +Topic name: Shipping and delivery issues +--- +Topic: +Sample texts from this topic: +[DOCUMENTS] +Keywords: [KEYWORDS] +Topic name:""" class LangChain(BaseRepresentation): """Using chains in langchain to generate topic labels. - You can use chains or Runnables such as those composed using the LangChain Expression Language - as long as their schema respects the conditions defined below. + The classic example uses `langchain.chains.question_answering.load_qa_chain`. + This returns a chain that takes a list of documents and a question as input. + + You can also use Runnables such as those composed using the LangChain Expression Language. Arguments: chain: The langchain chain or Runnable with a `batch` method. - Input keys must be `DOCUMENTS` (mandatory) and `KEYWORDS` (optional). + Input keys must be `input_documents` and `question`. + Output key must be `output_text`. 
+ prompt: The prompt to be used in the model. If no prompt is given, + `self.default_prompt_` is used instead. + NOTE: Use `"[KEYWORDS]"` in the prompt + to decide where the keywords need to be + inserted. Keywords won't be included unless + indicated. Unlike other representation models, + Langchain does not use the `"[DOCUMENTS]"` tag + to insert documents into the prompt. The load_qa_chain function + formats the representative documents within the prompt. nr_docs: The number of documents to pass to LangChain diversity: The diversity of documents to pass to LangChain. Accepts values between 0 and 1. A higher @@ -53,30 +87,14 @@ class LangChain(BaseRepresentation): like openai: `pip install langchain` - `pip install langchain_openai` + `pip install openai` Then, you can create your chain as follows: ```python - from langchain_openai import ChatOpenAI - from langchain_core.prompts import ChatPromptTemplate - from langchain_core.runnables import RunnablePassthrough - from langchain.chains.combine_documents import create_stuff_documents_chain - - chat_model = ChatOpenAI(model=..., api_key=...) - - # For simple prompts - prompt = ChatPromptTemplate.from_template("What are these documents about? {documents}. 
Here are some keywords about them {keywords} Please give a single label.") - - # For multi-message prompts - prompt = ChatPromptTemplate.from_messages( - [ - ("system", "You are provided with a list of documents and are asked to provide a single label for the topic."), - ("human", "Here is the list of documents: {documents}"), - ] - ) - - chain = RunnablePassthrough.assign(representation=create_stuff_documents_chain(chat_model, prompt, document_variable_name="documents")) + from langchain.chains.question_answering import load_qa_chain + from langchain.llms import OpenAI + chain = load_qa_chain(OpenAI(temperature=0, openai_api_key=my_openai_api_key), chain_type="stuff") ``` Finally, you can pass the chain to BERTopic as follows: @@ -91,21 +109,26 @@ class LangChain(BaseRepresentation): topic_model = BERTopic(representation_model=representation_model) ``` + You can also use a custom prompt: + + ```python + prompt = "What are these documents about? Please give a single label." + representation_model = LangChain(chain, prompt=prompt) + ``` + You can also use a Runnable instead of a chain. The example below uses the LangChain Expression Language: ```python from bertopic.representation import LangChain + from langchain.chains.question_answering import load_qa_chain from langchain.chat_models import ChatAnthropic - from langchain_core.documents import Document - from langchain_core.prompts import ChatPromptTemplate - from langchain_core.runnables import RunnablePassthrough - from langchain.chains.combine_documents import create_stuff_documents_chain + from langchain.schema.document import Document + from langchain.schema.runnable import RunnablePassthrough from langchain_experimental.data_anonymizer.presidio import PresidioReversibleAnonymizer prompt = ... - - chat_model = ... + llm = ... 
# We will construct a special privacy-preserving chain using Microsoft Presidio @@ -113,7 +136,7 @@ class LangChain(BaseRepresentation): chain = ( { - "documents": ( + "input_documents": ( lambda inp: [ Document( page_content=pii_handler.anonymize( @@ -121,22 +144,23 @@ class LangChain(BaseRepresentation): language="en", ), ) - for d in inp["documents"] + for d in inp["input_documents"] ] ), - "keywords": RunnablePassthrough(), + "question": RunnablePassthrough(), } - | create_stuff_documents_chain(chat_model, prompt, document_variable_name="documents") - | (lambda output: {"representation": pii_handler.deanonymize(output["representation"])}) + | load_qa_chain(representation_llm, chain_type="stuff") + | (lambda output: {"output_text": pii_handler.deanonymize(output["output_text"])}) ) - representation_model = LangChain(chain) + representation_model = LangChain(chain, prompt=representation_prompt) ``` """ def __init__( self, chain, + prompt: str = DEFAULT_PROMPT, nr_docs: int = 4, diversity: float = None, doc_length: int = None, @@ -144,6 +168,8 @@ def __init__( chain_config=None, ): self.chain = chain + self.prompt = prompt + self.default_prompt_ = DEFAULT_PROMPT self.chain_config = chain_config self.nr_docs = nr_docs self.diversity = diversity @@ -187,21 +213,24 @@ def extract_topics( for docs in repr_docs_mappings.values() ] - # `self.chain` must take `documents` as a mandatory input key and `keywords` as an optional input key - formatted_keywords_list = [] - for topic in topics: - keywords = list(zip(*topics[topic]))[0] - formatted_keywords_list.append(", ".join(keywords)) + # `self.chain` must take `input_documents` and `question` as input keys + # Use a custom prompt that leverages keywords, using the tag: [KEYWORDS] + if "[KEYWORDS]" in self.prompt: + prompts = [] + for topic in topics: + keywords = list(zip(*topics[topic]))[0] + prompt = self.prompt.replace("[KEYWORDS]", ", ".join(keywords)) + prompts.append(prompt) - # Documents are passed as a list of 
langchain Document objects, it is up to the chain to format them into a str - inputs = [ - {"DOCUMENTS": docs, "KEYWORDS": formatted_keywords} - for docs, formatted_keywords in zip(chain_docs, formatted_keywords_list) - ] + inputs = [{"input_documents": docs, "question": prompt} for docs, prompt in zip(chain_docs, prompts)] + + else: + inputs = [{"input_documents": docs, "question": self.prompt} for docs in chain_docs] - # `self.chain` must return a dict with an `representation` key + # `self.chain` must return a dict with an `output_text` key + # same output key as the `StuffDocumentsChain` returned by `load_qa_chain` outputs = self.chain.batch(inputs=inputs, config=self.chain_config) - labels = [output.strip() for output in outputs] + labels = [output["output_text"].strip() for output in outputs] updated_topics = { topic: [(label, 1)] + [("", 0) for _ in range(9)] for topic, label in zip(repr_docs_mappings.keys(), labels) From d57366ab0e8c5352f77c7514d8041e35651be6f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Tamines?= Date: Thu, 31 Oct 2024 15:21:29 +0100 Subject: [PATCH 05/19] Update prompt docstring --- bertopic/representation/_langchain.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index 53033ffb..59948b04 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -48,15 +48,10 @@ class LangChain(BaseRepresentation): chain: The langchain chain or Runnable with a `batch` method. Input keys must be `input_documents` and `question`. Output key must be `output_text`. - prompt: The prompt to be used in the model. If no prompt is given, - `self.default_prompt_` is used instead. - NOTE: Use `"[KEYWORDS]"` in the prompt - to decide where the keywords need to be - inserted. Keywords won't be included unless - indicated. 
Unlike other representation models, - Langchain does not use the `"[DOCUMENTS]"` tag - to insert documents into the prompt. The load_qa_chain function - formats the representative documents within the prompt. + prompt: A string containing placeholders `[DOCUMENTS]` and `[KEYWORDS]` that will be + replaced with the actual documents and keywords during processing. If not provided, + the default prompt defined in DEFAULT_PROMPT will be used. Note that the prompt is + only used in the basic LangChain stuff documents chain (used when `llm` is provided). nr_docs: The number of documents to pass to LangChain diversity: The diversity of documents to pass to LangChain. Accepts values between 0 and 1. A higher From 1f3a721542c990528252974ac283841a80d16c88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Tamines?= Date: Thu, 31 Oct 2024 15:23:23 +0100 Subject: [PATCH 06/19] Remove self.default_prompt and add type to chain_config --- bertopic/representation/_langchain.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index 59948b04..a7a29062 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -48,9 +48,9 @@ class LangChain(BaseRepresentation): chain: The langchain chain or Runnable with a `batch` method. Input keys must be `input_documents` and `question`. Output key must be `output_text`. - prompt: A string containing placeholders `[DOCUMENTS]` and `[KEYWORDS]` that will be - replaced with the actual documents and keywords during processing. If not provided, - the default prompt defined in DEFAULT_PROMPT will be used. Note that the prompt is + prompt: A string containing placeholders `[DOCUMENTS]` and `[KEYWORDS]` that will be + replaced with the actual documents and keywords during processing. If not provided, + the default prompt defined in DEFAULT_PROMPT will be used. 
Note that the prompt is only used in the basic LangChain stuff documents chain (used when `llm` is provided). nr_docs: The number of documents to pass to LangChain diversity: The diversity of documents to pass to LangChain. @@ -160,16 +160,15 @@ def __init__( diversity: float = None, doc_length: int = None, tokenizer: Union[str, Callable] = None, - chain_config=None, + chain_config: dict = None, ): self.chain = chain self.prompt = prompt - self.default_prompt_ = DEFAULT_PROMPT - self.chain_config = chain_config self.nr_docs = nr_docs self.diversity = diversity self.doc_length = doc_length self.tokenizer = tokenizer + self.chain_config = chain_config def extract_topics( self, From b687aca58230e086dc9fb71963502dfd154d0390 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Tamines?= Date: Thu, 31 Oct 2024 15:34:19 +0100 Subject: [PATCH 07/19] Add an llm argument and docstring --- bertopic/representation/_langchain.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index a7a29062..481d9980 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -2,6 +2,8 @@ from langchain.docstore.document import Document from scipy.sparse import csr_matrix from typing import Callable, Mapping, List, Tuple, Union +from langchain_core.language_models import LanguageModelLike +from langchain_core.runnables import Runnable from bertopic.representation._base import BaseRepresentation from bertopic.representation._utils import truncate_document @@ -45,13 +47,17 @@ class LangChain(BaseRepresentation): You can also use Runnables such as those composed using the LangChain Expression Language. Arguments: - chain: The langchain chain or Runnable with a `batch` method. - Input keys must be `input_documents` and `question`. - Output key must be `output_text`. + llm: The language model to use for creating a basic langchain chain. 
+ This parameter is used to create a default chain if no custom chain + is provided. If a custom chain is provided via the `chain` parameter, + this parameter is ignored. prompt: A string containing placeholders `[DOCUMENTS]` and `[KEYWORDS]` that will be replaced with the actual documents and keywords during processing. If not provided, the default prompt defined in DEFAULT_PROMPT will be used. Note that the prompt is only used in the basic LangChain stuff documents chain (used when `llm` is provided). + chain: The langchain chain or Runnable with a `batch` method. + Input keys must be `input_documents` and `question`. + Output key must be `output_text`. nr_docs: The number of documents to pass to LangChain diversity: The diversity of documents to pass to LangChain. Accepts values between 0 and 1. A higher @@ -154,16 +160,18 @@ class LangChain(BaseRepresentation): def __init__( self, - chain, + llm: LanguageModelLike = None, prompt: str = DEFAULT_PROMPT, + chain: Runnable = None, nr_docs: int = 4, diversity: float = None, doc_length: int = None, tokenizer: Union[str, Callable] = None, chain_config: dict = None, ): - self.chain = chain + self.llm = llm self.prompt = prompt + self.chain = chain self.nr_docs = nr_docs self.diversity = diversity self.doc_length = doc_length From 5ab66e8e1d8076943e21b9ff8adf60960a1ae10d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Tamines?= Date: Thu, 31 Oct 2024 15:42:48 +0100 Subject: [PATCH 08/19] Add logic to create a basic chain or custom chain --- bertopic/representation/_langchain.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index 481d9980..de88eece 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -4,7 +4,8 @@ from typing import Callable, Mapping, List, Tuple, Union from langchain_core.language_models import LanguageModelLike from 
langchain_core.runnables import Runnable - +from langchain_core.prompts import ChatPromptTemplate +from langchain.chains.combine_documents import create_stuff_documents_chain from bertopic.representation._base import BaseRepresentation from bertopic.representation._utils import truncate_document @@ -55,9 +56,10 @@ class LangChain(BaseRepresentation): replaced with the actual documents and keywords during processing. If not provided, the default prompt defined in DEFAULT_PROMPT will be used. Note that the prompt is only used in the basic LangChain stuff documents chain (used when `llm` is provided). - chain: The langchain chain or Runnable with a `batch` method. - Input keys must be `input_documents` and `question`. - Output key must be `output_text`. + chain: A custom LangChain chain to be used instead of the basic LangChain stuff documents chain that + is created using `llm` and `prompt`. The chain must accept the input key `DOCUMENTS` and optionally + the input key `KEYWORDS`. It should output either a string or a list directly (not as a dict). + If provided, `llm` and `prompt` are ignored. nr_docs: The number of documents to pass to LangChain diversity: The diversity of documents to pass to LangChain. Accepts values between 0 and 1. 
A higher @@ -169,9 +171,20 @@ def __init__( tokenizer: Union[str, Callable] = None, chain_config: dict = None, ): - self.llm = llm - self.prompt = prompt - self.chain = chain + if chain is not None: + self.chain = chain + elif llm is not None: + # Convert prompt placeholders to the LangChain format + langchain_prompt = prompt.replace("[DOCUMENTS]", "{DOCUMENTS}").replace("[KEYWORDS]", "{KEYWORDS}") + + # Create ChatPromptTemplate + chat_prompt = ChatPromptTemplate.from_template(langchain_prompt) + + # Create a basic LangChain chain using create_stuff_documents_chain + self.chain = create_stuff_documents_chain(llm, chat_prompt, document_variable_name="DOCUMENTS") + else: + raise ValueError("Either `llm` or `chain` must be provided") + self.nr_docs = nr_docs self.diversity = diversity self.doc_length = doc_length From 97a8125b25523bebc094975bdf077e2baba18e14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Tamines?= Date: Thu, 31 Oct 2024 19:25:03 +0100 Subject: [PATCH 09/19] Improve docstring and examples for basic and advanced usage --- bertopic/representation/_langchain.py | 146 ++++++++++++-------------- 1 file changed, 67 insertions(+), 79 deletions(-) diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index de88eece..9c2c85ef 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -1,5 +1,5 @@ import pandas as pd -from langchain.docstore.document import Document +from langchain_core.documents import Document from scipy.sparse import csr_matrix from typing import Callable, Mapping, List, Tuple, Union from langchain_core.language_models import LanguageModelLike @@ -9,7 +9,6 @@ from bertopic.representation._base import BaseRepresentation from bertopic.representation._utils import truncate_document - DEFAULT_PROMPT = """ This is a list of texts where each collection of texts describe a topic. 
After each collection of texts, the name of the topic they represent is mentioned as a short-highly-descriptive title --- @@ -40,26 +39,21 @@ class LangChain(BaseRepresentation): - """Using chains in langchain to generate topic labels. - - The classic example uses `langchain.chains.question_answering.load_qa_chain`. - This returns a chain that takes a list of documents and a question as input. - - You can also use Runnables such as those composed using the LangChain Expression Language. + """This representation model uses LangChain to generate descriptive topic labels. It supports two main usage patterns. + 1. Basic usage with a language model and optional custom prompt + 2. Advanced usage with a custom LangChain chain for full control over the generation process Arguments: - llm: The language model to use for creating a basic langchain chain. - This parameter is used to create a default chain if no custom chain - is provided. If a custom chain is provided via the `chain` parameter, - this parameter is ignored. - prompt: A string containing placeholders `[DOCUMENTS]` and `[KEYWORDS]` that will be - replaced with the actual documents and keywords during processing. If not provided, - the default prompt defined in DEFAULT_PROMPT will be used. Note that the prompt is - only used in the basic LangChain stuff documents chain (used when `llm` is provided). - chain: A custom LangChain chain to be used instead of the basic LangChain stuff documents chain that - is created using `llm` and `prompt`. The chain must accept the input key `DOCUMENTS` and optionally - the input key `KEYWORDS`. It should output either a string or a list directly (not as a dict). - If provided, `llm` and `prompt` are ignored. + llm: A LangChain text model or chat model used to generate representations, only needed for basic usage. + Examples include ChatOpenAI or ChatAnthropic. Ignored if a custom chain is provided. 
+ prompt: A string template containing the placeholder [DOCUMENTS] and optionally [KEYWORDS], only needed for basic usage. + Defaults to a pre-defined prompt defined in DEFAULT_PROMPT. Ignored if a custom chain is provided. + chain: A custom LangChain chain to generate representations, only needed for advanced usage. + The chain must be a LangChain Runnable that implements the batch method and accepts these input keys: + - DOCUMENTS: (required) A list of LangChain Document objects + - KEYWORDS: (optional) A list of topic keywords + The chain must directly output either a string label or a list of strings. + If provided, llm and prompt are ignored. nr_docs: The number of documents to pass to LangChain diversity: The diversity of documents to pass to LangChain. Accepts values between 0 and 1. A higher @@ -81,83 +75,77 @@ class LangChain(BaseRepresentation): * If tokenizer is a callable, then that callable is used to tokenize the document. These tokens are counted and truncated depending on `doc_length` - chain_config: The configuration for the langchain chain. Can be used to set options - like max_concurrency to avoid rate limiting errors. + chain_config: The configuration for the LangChain chain. Can be used to set options like max_concurrency to avoid rate limiting errors. + Usage: - To use this, you will need to install the langchain package first. - Additionally, you will need an underlying LLM to support langchain, - like openai: + To use this representation, you will need to install the LangChain package first. - `pip install langchain` - `pip install openai` + `pip install langchain` - Then, you can create your chain as follows: + There are two ways to use the LangChain representation: - ```python - from langchain.chains.question_answering import load_qa_chain - from langchain.llms import OpenAI - chain = load_qa_chain(OpenAI(temperature=0, openai_api_key=my_openai_api_key), chain_type="stuff") - ``` + 1. 
Use a default LangChain chain that is created using an underlying language model and a prompt. - Finally, you can pass the chain to BERTopic as follows: + You will first need to install the package for the underlying model. For example, if you want to use OpenAI: - ```python - from bertopic.representation import LangChain + `pip install langchain_openai` - # Create your representation model - representation_model = LangChain(chain) + ```python + from bertopic.representation import LangChain + from langchain_openai import ChatOpenAI - # Use the representation model in BERTopic on top of the default pipeline - topic_model = BERTopic(representation_model=representation_model) - ``` + chat_model = ChatOpenAI(temperature=0, openai_api_key=my_openai_api_key) - You can also use a custom prompt: + # Create your representation model with the pre-defined prompt + representation_model = LangChain(llm=chat_model) - ```python - prompt = "What are these documents about? Please give a single label." - representation_model = LangChain(chain, prompt=prompt) - ``` + # Create your representation model with a custom prompt + prompt = "Output a single label that describes the following documents: [DOCUMENTS]" + representation_model = LangChain(llm=chat_model, prompt=prompt) - You can also use a Runnable instead of a chain. - The example below uses the LangChain Expression Language: + # Use the representation model in BERTopic on top of the default pipeline + topic_model = BERTopic(representation_model=representation_model) + ``` - ```python - from bertopic.representation import LangChain - from langchain.chains.question_answering import load_qa_chain - from langchain.chat_models import ChatAnthropic - from langchain.schema.document import Document - from langchain.schema.runnable import RunnablePassthrough - from langchain_experimental.data_anonymizer.presidio import PresidioReversibleAnonymizer + 2. 
Use a custom LangChain chain for full control over the generation process: - prompt = ... - llm = ... + ```python + from bertopic.representation import LangChain + from langchain_anthropic import ChatAnthropic + from langchain_core.documents import Document + from langchain_core.prompts import ChatPromptTemplate + from langchain.chains.combine_documents import create_stuff_documents_chain + from langchain_experimental.data_anonymizer.presidio import PresidioReversibleAnonymizer - # We will construct a special privacy-preserving chain using Microsoft Presidio + prompt = ... - pii_handler = PresidioReversibleAnonymizer(analyzed_fields=["PERSON"]) + chat_model = ... - chain = ( - { - "input_documents": ( - lambda inp: [ - Document( - page_content=pii_handler.anonymize( - d.page_content, - language="en", - ), - ) - for d in inp["input_documents"] - ] - ), - "question": RunnablePassthrough(), - } - | load_qa_chain(representation_llm, chain_type="stuff") - | (lambda output: {"output_text": pii_handler.deanonymize(output["output_text"])}) - ) + # We will construct a special privacy-preserving chain using Microsoft Presidio + + pii_handler = PresidioReversibleAnonymizer(analyzed_fields=["PERSON"]) + + chain = ( + { + "DOCUMENTS": ( + lambda inp: [ + Document( + page_content=pii_handler.anonymize( + d.page_content, + language="en", + ), + ) + for d in inp["DOCUMENTS"] + ] + ), + "KEYWORDS": lambda keywords: keywords["KEYWORDS"], + } + | create_stuff_documents_chain(chat_model, prompt, document_variable_name="DOCUMENTS") + ) - representation_model = LangChain(chain, prompt=representation_prompt) - ``` + representation_model = LangChain(chain=chain) + ``` """ def __init__( From da10d1fb2c01acdaf7f7c4d851036de79f09902b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Tamines?= Date: Thu, 31 Oct 2024 20:54:13 +0100 Subject: [PATCH 10/19] Update the docstring for details on custom chain --- bertopic/representation/_langchain.py | 9 +++++++-- 1 file changed, 7 
insertions(+), 2 deletions(-) diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index 9c2c85ef..53f95c01 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -39,7 +39,9 @@ class LangChain(BaseRepresentation): - """This representation model uses LangChain to generate descriptive topic labels. It supports two main usage patterns. + """This representation model uses LangChain to generate descriptive topic labels. + + It supports two main usage patterns: 1. Basic usage with a language model and optional custom prompt 2. Advanced usage with a custom LangChain chain for full control over the generation process @@ -87,7 +89,7 @@ class LangChain(BaseRepresentation): 1. Use a default LangChain chain that is created using an underlying language model and a prompt. - You will first need to install the package for the underlying model. For example, if you want to use OpenAI: + You will first need to install the package for the underlying model. For example, if you want to use OpenAI: `pip install langchain_openai` @@ -110,6 +112,9 @@ class LangChain(BaseRepresentation): 2. Use a custom LangChain chain for full control over the generation process: + Remember that the chain will receive two inputs: `DOCUMENTS` and `KEYWORDS` and that it must return directly a string label + or a list of strings. + ```python from bertopic.representation import LangChain from langchain_anthropic import ChatAnthropic From 038f73f95c29b949c40a63d4c4ca3d3ec8365f20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Tamines?= Date: Thu, 31 Oct 2024 21:10:27 +0100 Subject: [PATCH 11/19] Add error handling for incorrect prompt. 
Add processing of keywords into input for the chain --- bertopic/representation/_langchain.py | 40 +++++++++++++++------------ 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index 53f95c01..178b5dc9 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -103,7 +103,7 @@ class LangChain(BaseRepresentation): representation_model = LangChain(llm=chat_model) # Create your representation model with a custom prompt - prompt = "Output a single label that describes the following documents: [DOCUMENTS]" + prompt = "What are these documents about? [DOCUMENTS] Here are keywords related to them [KEYWORDS]." representation_model = LangChain(llm=chat_model, prompt=prompt) # Use the representation model in BERTopic on top of the default pipeline @@ -164,9 +164,15 @@ def __init__( tokenizer: Union[str, Callable] = None, chain_config: dict = None, ): + self.prompt = prompt + if chain is not None: self.chain = chain elif llm is not None: + # Check that the prompt contains the necessary placeholder + if "[DOCUMENTS]" not in prompt: + raise ValueError("The prompt must contain the placeholder [DOCUMENTS]") + # Convert prompt placeholders to the LangChain format langchain_prompt = prompt.replace("[DOCUMENTS]", "{DOCUMENTS}").replace("[KEYWORDS]", "{KEYWORDS}") @@ -221,24 +227,24 @@ def extract_topics( for docs in repr_docs_mappings.values() ] - # `self.chain` must take `input_documents` and `question` as input keys - # Use a custom prompt that leverages keywords, using the tag: [KEYWORDS] - if "[KEYWORDS]" in self.prompt: - prompts = [] - for topic in topics: - keywords = list(zip(*topics[topic]))[0] - prompt = self.prompt.replace("[KEYWORDS]", ", ".join(keywords)) - prompts.append(prompt) - - inputs = [{"input_documents": docs, "question": prompt} for docs, prompt in zip(chain_docs, prompts)] - - else: - inputs = [{"input_documents": docs, "question": 
self.prompt} for docs in chain_docs] + # Extract keywords from the topics and format them as a string + formatted_keywords_list = [] + for topic in topics: + keywords = list(zip(*topics[topic]))[0] + formatted_keywords_list.append(", ".join(keywords)) + + # self.chain must accept DOCUMENTS as a mandatory input key and KEYWORDS as an optional input key + # We always pass both keys to the chain, and the chain can choose to use them or not + # Documents are passed as a list of LangChain Document objects, it is up to the chain to format them into a string + inputs = [ + {"DOCUMENTS": docs, "KEYWORDS": formatted_keywords} + for docs, formatted_keywords in zip(chain_docs, formatted_keywords_list) + ] - # `self.chain` must return a dict with an `output_text` key - # same output key as the `StuffDocumentsChain` returned by `load_qa_chain` + # self.chain must return a string label or a list of string labels for each input outputs = self.chain.batch(inputs=inputs, config=self.chain_config) - labels = [output["output_text"].strip() for output in outputs] + + labels = [output.strip() for output in outputs] updated_topics = { topic: [(label, 1)] + [("", 0) for _ in range(9)] for topic, label in zip(repr_docs_mappings.keys(), labels) From 90fcd53037140e3a356b8ddc9a3cb071bd1e397e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Tamines?= Date: Thu, 31 Oct 2024 21:15:53 +0100 Subject: [PATCH 12/19] Allow list outputs in representation --- bertopic/representation/_langchain.py | 32 ++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index 178b5dc9..dc2962e9 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -244,10 +244,32 @@ def extract_topics( # self.chain must return a string label or a list of string labels for each input outputs = self.chain.batch(inputs=inputs, config=self.chain_config) - labels = 
[output.strip() for output in outputs] - - updated_topics = { - topic: [(label, 1)] + [("", 0) for _ in range(9)] for topic, label in zip(repr_docs_mappings.keys(), labels) - } + # Process outputs from the chain - can be either strings or lists of strings + updated_topics = {} + for topic, output in zip(repr_docs_mappings.keys(), outputs): + # Each output can be either: + # - A single string representing the main topic label + # - A list of strings representing multiple related labels + if isinstance(output, str): + # For string output: use it as the main label (weight=1) + # and pad with 9 empty strings (weight=0) + labels = [(output.strip(), 1)] + [("", 0) for _ in range(9)] + else: + # For list output: + # 1. Convert all elements to stripped strings + # 2. Take up to 10 elements + # 3. Assign decreasing weights from 1.0 to 0.1 + # 4. Pad with empty strings if needed to always have 10 elements + clean_outputs = [str(label).strip() for label in output] + top_labels = clean_outputs[:10] + + # Create (label, weight) pairs with decreasing weights + labels = [(label, 1.0 - (i * 0.1)) for i, label in enumerate(top_labels)] + + # Pad with empty strings if we have less than 10 labels + if len(labels) < 10: + labels.extend([("", 0.0) for _ in range(10 - len(labels))]) + + updated_topics[topic] = labels return updated_topics From 89f767c64094cf3a868bd4521f84190781c5f84e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Tamines?= Date: Thu, 31 Oct 2024 21:16:12 +0100 Subject: [PATCH 13/19] linting --- bertopic/representation/_langchain.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index dc2962e9..acff851a 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -262,14 +262,14 @@ def extract_topics( # 4. 
Pad with empty strings if needed to always have 10 elements clean_outputs = [str(label).strip() for label in output] top_labels = clean_outputs[:10] - + # Create (label, weight) pairs with decreasing weights labels = [(label, 1.0 - (i * 0.1)) for i, label in enumerate(top_labels)] - + # Pad with empty strings if we have less than 10 labels if len(labels) < 10: labels.extend([("", 0.0) for _ in range(10 - len(labels))]) - + updated_topics[topic] = labels return updated_topics From f9b3c75707b912473b98e62fa0c5425d90af64cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Tamines?= Date: Thu, 31 Oct 2024 22:27:15 +0100 Subject: [PATCH 14/19] Fix typos in prompt --- bertopic/representation/_langchain.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index acff851a..74b8fcc9 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -10,12 +10,12 @@ from bertopic.representation._utils import truncate_document DEFAULT_PROMPT = """ -This is a list of texts where each collection of texts describe a topic. After each collection of texts, the name of the topic they represent is mentioned as a short-highly-descriptive title +This is a list of texts where each collection of texts describes a topic. After each collection of texts, the name of the topic they represent is mentioned as a short, highly descriptive title. --- Topic: Sample texts from this topic: -- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food. -- Meat, but especially beef, is the word food in terms of emissions. +- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial-style meat production and factory farming, meat has become a staple food. 
+- Meat, but especially beef, is the worst food in terms of emissions. - Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one. Keywords: meat beef eat eating emissions steak food health processed chicken From d4159742498c35e292c4f32b15c98e97cf4ef3f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Tamines?= Date: Fri, 8 Nov 2024 13:13:35 +0100 Subject: [PATCH 15/19] Fix formatting of examples in the prompt to match how the actual data is formatted --- bertopic/representation/_langchain.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index 74b8fcc9..af9ab4e1 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -14,26 +14,27 @@ --- Topic: Sample texts from this topic: -- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial-style meat production and factory farming, meat has become a staple food. -- Meat, but especially beef, is the worst food in terms of emissions. -- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one. +Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial-style meat production and factory farming, meat has become a staple food. +Meat, but especially beef, is the worst food in terms of emissions. +Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one. -Keywords: meat beef eat eating emissions steak food health processed chicken +Keywords: meat, beef, eat, eating, emissions, steak, food, health, processed, chicken Topic name: Environmental impacts of eating meat --- Topic: Sample texts from this topic: -- I have ordered the product weeks ago but it still has not arrived! 
-- The website mentions that it only takes a couple of days to deliver but I still have not received mine. -- I got a message stating that I received the monitor but that is not true! -- It took a month longer to deliver than was advised... +I have ordered the product weeks ago but it still has not arrived! +The website mentions that it only takes a couple of days to deliver but I still have not received mine. +I got a message stating that I received the monitor but that is not true! +It took a month longer to deliver than was advised... -Keywords: deliver weeks product shipping long delivery received arrived arrive week +Keywords: deliver, weeks, product, shipping, long, delivery, received, arrived, arrive, week Topic name: Shipping and delivery issues --- Topic: Sample texts from this topic: [DOCUMENTS] + Keywords: [KEYWORDS] Topic name:""" @@ -180,7 +181,12 @@ def __init__( chat_prompt = ChatPromptTemplate.from_template(langchain_prompt) # Create a basic LangChain chain using create_stuff_documents_chain - self.chain = create_stuff_documents_chain(llm, chat_prompt, document_variable_name="DOCUMENTS") + self.chain = create_stuff_documents_chain( + llm, + chat_prompt, + document_variable_name="DOCUMENTS", + document_separator="\n", + ) else: raise ValueError("Either `llm` or `chain` must be provided") From 4f1fddc31f0a860686e90dcb3f5b456c7cc1309d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Tamines?= Date: Fri, 8 Nov 2024 13:21:55 +0100 Subject: [PATCH 16/19] Fix newline issue --- bertopic/representation/_langchain.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index af9ab4e1..db78877a 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -9,8 +9,7 @@ from bertopic.representation._base import BaseRepresentation from bertopic.representation._utils import truncate_document -DEFAULT_PROMPT = """ -This is a list 
of texts where each collection of texts describes a topic. After each collection of texts, the name of the topic they represent is mentioned as a short, highly descriptive title. +DEFAULT_PROMPT = """This is a list of texts where each collection of texts describes a topic. After each collection of texts, the name of the topic they represent is mentioned as a short, highly descriptive title. --- Topic: Sample texts from this topic: From 7a224813dc11624c1490a7dda95eef293cb92297 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Tamines?= Date: Fri, 8 Nov 2024 13:33:24 +0100 Subject: [PATCH 17/19] Use constant weight of 1 for all labels when the chain returns a list of labels --- bertopic/representation/_langchain.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index db78877a..cd84b23c 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -263,17 +263,17 @@ def extract_topics( # For list output: # 1. Convert all elements to stripped strings # 2. Take up to 10 elements - # 3. Assign decreasing weights from 1.0 to 0.1 + # 3. Assign weight of 1 to all elements # 4. 
Pad with empty strings if needed to always have 10 elements clean_outputs = [str(label).strip() for label in output] top_labels = clean_outputs[:10] - # Create (label, weight) pairs with decreasing weights - labels = [(label, 1.0 - (i * 0.1)) for i, label in enumerate(top_labels)] + # Create (label, weight) pairs with weight=1 + labels = [(label, 1) for label in top_labels] # Pad with empty strings if we have less than 10 labels if len(labels) < 10: - labels.extend([("", 0.0) for _ in range(10 - len(labels))]) + labels.extend([("", 0) for _ in range(10 - len(labels))]) updated_topics[topic] = labels From 4be8f566ff85395a35c491d2f1a42c2944bd36fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Tamines?= Date: Fri, 8 Nov 2024 14:09:40 +0100 Subject: [PATCH 18/19] Add an example in the documentation of advanced usage with list output --- bertopic/representation/_langchain.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index cd84b23c..922f7f44 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -115,6 +115,7 @@ class LangChain(BaseRepresentation): Remember that the chain will receive two inputs: `DOCUMENTS` and `KEYWORDS` and that it must return directly a string label or a list of strings. 
+ Example of a custom chain that uses Microsoft Presidio to anonymize the documents and returns a single label: ```python from bertopic.representation import LangChain from langchain_anthropic import ChatAnthropic @@ -151,6 +152,30 @@ class LangChain(BaseRepresentation): representation_model = LangChain(chain=chain) ``` + + Example of a custom chain that returns a list of labels: + ```python + from bertopic.representation import LangChain + from langchain_openai import ChatOpenAI + from langchain_core.prompts import ChatPromptTemplate + from langchain.chains.combine_documents import create_stuff_documents_chain + from langchain_core.output_parsers import CommaSeparatedListOutputParser + + chat_model = ... + + list_prompt = ChatPromptTemplate.from_template( + "Here is a list of documents: {DOCUMENTS}. Output a comma-separated list of keywords that represents these documents." + ) + list_chain = create_stuff_documents_chain( + llm=chat_model, + prompt=list_prompt, + document_variable_name="DOCUMENTS", + output_parser=CommaSeparatedListOutputParser() + ) + + representation_model = LangChain(chain=list_chain) + ``` + """ def __init__( From 56ea9fbd481ba0733f8b4bce6bbaeca17608273a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Tamines?= Date: Fri, 8 Nov 2024 14:19:16 +0100 Subject: [PATCH 19/19] Update docs --- docs/getting_started/representation/llm.md | 78 +++++++++++++++------- 1 file changed, 53 insertions(+), 25 deletions(-) diff --git a/docs/getting_started/representation/llm.md b/docs/getting_started/representation/llm.md index 2b35571d..59b230a4 100644 --- a/docs/getting_started/representation/llm.md +++ b/docs/getting_started/representation/llm.md @@ -468,56 +468,84 @@ representation_model = OpenAI(client, model="gpt-3.5-turbo", chat=True, prompt=s The above is not constrained to just creating a short description or summary of the topic, we can extract labels, keywords, poems, example documents, extensitive descriptions, and more using this method! 
If you want to have multiple representations of a single topic, it might be worthwhile to also check out [**multi-aspect**](https://maartengr.github.io/BERTopic/getting_started/multiaspect/multiaspect.html) topic modeling with BERTopic. - ## **LangChain** -[Langchain](https://github.com/hwchase17/langchain) is a package that helps users with chaining large language models. -In BERTopic, we can leverage this package in order to more efficiently combine external knowledge. Here, this -external knowledge are the most representative documents in each topic. +[LangChain](https://github.com/hwchase17/langchain) can be used to generate descriptive topic labels in BERTopic. It supports both basic usage with language models and advanced usage with custom chains for full control over the generation process. -To use langchain, you will need to install the langchain package first. Additionally, you will need an underlying LLM to support langchain, -like openai: +To use LangChain, you will need to install it first, along with the specific integration for your chosen language model. For example, to use OpenAI models: ```bash pip install langchain -pip langchain_openai +pip install langchain-openai ``` -Then, you can create your chain as follows: +See the [LangChain integrations page](https://python.langchain.com/docs/integrations/chat/) for the full list of supported chat models and their required packages. + +There are two main ways to use LangChain with BERTopic: + +### **Basic Usage** + +The simplest way is to use a language model with an optional custom prompt: ```python +from bertopic.representation import LangChain from langchain_openai import ChatOpenAI -from langchain_core.prompts import ChatPromptTemplate -from langchain_core.runnables import RunnablePassthrough -from langchain.chains.combine_documents import create_stuff_documents_chain -chat_model = ChatOpenAI(model=..., api_key=...) 
+# Create a chat model +chat_model = ChatOpenAI(temperature=0, openai_api_key="...") -prompt = ChatPromptTemplate.from_template("What are these documents about? {documents}. Please give a single label.") +# Create your representation model with the pre-defined prompt +representation_model = LangChain(llm=chat_model) -chain = RunnablePassthrough.assign(representation=create_stuff_documents_chain(chat_model, prompt, document_variable_name="documents")) +# Use the representation model in BERTopic +topic_model = BERTopic(representation_model=representation_model) ``` -Finally, you can pass the chain to BERTopic as follows: +You can also customize the prompt: ```python -from bertopic.representation import LangChain +prompt = "Here is a list of documents: [DOCUMENTS]. These documents are described by these keywords: [KEYWORDS]. Please give a short label." +representation_model = LangChain(llm=chat_model, prompt=prompt) +``` -# Create your representation model -representation_model = LangChain(chain) +### **Advanced Usage** -# Use the representation model in BERTopic on top of the default pipeline +For more control, you can create a custom LangChain chain to generate the representations. The representation of a topic can in that case be a single label or a list of labels, and must be directly returned by the chain. + +Here's an example using multiple labels output: + +```python +from langchain_core.prompts import ChatPromptTemplate +from langchain.chains.combine_documents import create_stuff_documents_chain +from langchain_core.output_parsers import CommaSeparatedListOutputParser + +# Multiple Labels Output +list_prompt = ChatPromptTemplate.from_template( + "Here is a list of documents: {DOCUMENTS}. These documents are described by these keywords: {KEYWORDS}. Output a comma-separated list of labels that represents these documents." 
+) +list_chain = create_stuff_documents_chain( + llm=chat_model, + prompt=list_prompt, + document_variable_name="DOCUMENTS", + output_parser=CommaSeparatedListOutputParser() +) + +# Use in BERTopic +representation_model = LangChain(chain=list_chain) topic_model = BERTopic(representation_model=representation_model) ``` -You can also customize the prompt, and include the optional `keywords` placeholder to add the keywords to the prompt. +!!! note + When creating custom chains, the prompt uses LangChain's syntax with curly braces: `{DOCUMENTS}` and `{KEYWORDS}` instead of BERTopic's `[DOCUMENTS]` and `[KEYWORDS]`. + +### **Chain Configuration** + +You can configure the chain with different parameters or add callbacks. For example, to handle rate limits when using external APIs, you can control the number of concurrent requests: ```python -prompt = ChatPromptTemplate.from_messages( - [ - ("system", "You are provided with a list of documents and are asked to provide a single label for the topic."), - ("human", "Here is the list of documents: {documents} and related keywords: {keywords}"), - ] +representation_model = LangChain( + chain=chain, + chain_config={"max_concurrency": 5} ) ```