diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index df5c4839..922f7f44 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -1,35 +1,61 @@ import pandas as pd -from langchain.docstore.document import Document +from langchain_core.documents import Document from scipy.sparse import csr_matrix from typing import Callable, Mapping, List, Tuple, Union - +from langchain_core.language_models import LanguageModelLike +from langchain_core.runnables import Runnable +from langchain_core.prompts import ChatPromptTemplate +from langchain.chains.combine_documents import create_stuff_documents_chain from bertopic.representation._base import BaseRepresentation from bertopic.representation._utils import truncate_document -DEFAULT_PROMPT = "What are these documents about? Please give a single label." +DEFAULT_PROMPT = """This is a list of texts where each collection of texts describes a topic. After each collection of texts, the name of the topic they represent is mentioned as a short, highly descriptive title. +--- +Topic: +Sample texts from this topic: +Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial-style meat production and factory farming, meat has become a staple food. +Meat, but especially beef, is the worst food in terms of emissions. +Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one. + +Keywords: meat, beef, eat, eating, emissions, steak, food, health, processed, chicken +Topic name: Environmental impacts of eating meat +--- +Topic: +Sample texts from this topic: +I have ordered the product weeks ago but it still has not arrived! +The website mentions that it only takes a couple of days to deliver but I still have not received mine. +I got a message stating that I received the monitor but that is not true! +It took a month longer to deliver than was advised... 
+ +Keywords: deliver, weeks, product, shipping, long, delivery, received, arrived, arrive, week +Topic name: Shipping and delivery issues +--- +Topic: +Sample texts from this topic: +[DOCUMENTS] + +Keywords: [KEYWORDS] +Topic name:""" class LangChain(BaseRepresentation): - """Using chains in langchain to generate topic labels. - - The classic example uses `langchain.chains.question_answering.load_qa_chain`. - This returns a chain that takes a list of documents and a question as input. + """This representation model uses LangChain to generate descriptive topic labels. - You can also use Runnables such as those composed using the LangChain Expression Language. + It supports two main usage patterns: + 1. Basic usage with a language model and optional custom prompt + 2. Advanced usage with a custom LangChain chain for full control over the generation process Arguments: - chain: The langchain chain or Runnable with a `batch` method. - Input keys must be `input_documents` and `question`. - Output key must be `output_text`. - prompt: The prompt to be used in the model. If no prompt is given, - `self.default_prompt_` is used instead. - NOTE: Use `"[KEYWORDS]"` in the prompt - to decide where the keywords need to be - inserted. Keywords won't be included unless - indicated. Unlike other representation models, - Langchain does not use the `"[DOCUMENTS]"` tag - to insert documents into the prompt. The load_qa_chain function - formats the representative documents within the prompt. + llm: A LangChain text model or chat model used to generate representations, only needed for basic usage. + Examples include ChatOpenAI or ChatAnthropic. Ignored if a custom chain is provided. + prompt: A string template containing the placeholder [DOCUMENTS] and optionally [KEYWORDS], only needed for basic usage. + Defaults to a pre-defined prompt defined in DEFAULT_PROMPT. Ignored if a custom chain is provided. 
+ chain: A custom LangChain chain to generate representations, only needed for advanced usage. + The chain must be a LangChain Runnable that implements the batch method and accepts these input keys: + - DOCUMENTS: (required) A list of LangChain Document objects + - KEYWORDS: (optional) A string of comma-separated topic keywords + The chain must directly output either a string label or a list of strings. + If provided, llm and prompt are ignored. nr_docs: The number of documents to pass to LangChain diversity: The diversity of documents to pass to LangChain. Accepts values between 0 and 1. A higher @@ -51,103 +77,148 @@ class LangChain(BaseRepresentation): * If tokenizer is a callable, then that callable is used to tokenize the document. These tokens are counted and truncated depending on `doc_length` - chain_config: The configuration for the langchain chain. Can be used to set options - like max_concurrency to avoid rate limiting errors. + chain_config: The configuration for the LangChain chain. Can be used to set options like max_concurrency to avoid rate limiting errors. + Usage: - To use this, you will need to install the langchain package first. 
- Additionally, you will need an underlying LLM to support langchain, - like openai: - - `pip install langchain` - `pip install openai` - - Then, you can create your chain as follows: - - ```python - from langchain.chains.question_answering import load_qa_chain - from langchain.llms import OpenAI - chain = load_qa_chain(OpenAI(temperature=0, openai_api_key=my_openai_api_key), chain_type="stuff") - ``` - - Finally, you can pass the chain to BERTopic as follows: - - ```python - from bertopic.representation import LangChain - - # Create your representation model - representation_model = LangChain(chain) - - # Use the representation model in BERTopic on top of the default pipeline - topic_model = BERTopic(representation_model=representation_model) - ``` - - You can also use a custom prompt: - - ```python - prompt = "What are these documents about? Please give a single label." - representation_model = LangChain(chain, prompt=prompt) - ``` - - You can also use a Runnable instead of a chain. - The example below uses the LangChain Expression Language: - - ```python - from bertopic.representation import LangChain - from langchain.chains.question_answering import load_qa_chain - from langchain.chat_models import ChatAnthropic - from langchain.schema.document import Document - from langchain.schema.runnable import RunnablePassthrough - from langchain_experimental.data_anonymizer.presidio import PresidioReversibleAnonymizer - - prompt = ... - llm = ... 
- - # We will construct a special privacy-preserving chain using Microsoft Presidio - - pii_handler = PresidioReversibleAnonymizer(analyzed_fields=["PERSON"]) - - chain = ( - { - "input_documents": ( - lambda inp: [ - Document( - page_content=pii_handler.anonymize( - d.page_content, - language="en", - ), - ) - for d in inp["input_documents"] - ] - ), - "question": RunnablePassthrough(), - } - | load_qa_chain(representation_llm, chain_type="stuff") - | (lambda output: {"output_text": pii_handler.deanonymize(output["output_text"])}) - ) - - representation_model = LangChain(chain, prompt=representation_prompt) - ``` + To use this representation, you will need to install the LangChain package first. + + `pip install langchain` + + There are two ways to use the LangChain representation: + + 1. Use a default LangChain chain that is created using an underlying language model and a prompt. + + You will first need to install the package for the underlying model. For example, if you want to use OpenAI: + + `pip install langchain_openai` + + ```python + from bertopic.representation import LangChain + from langchain_openai import ChatOpenAI + + chat_model = ChatOpenAI(temperature=0, openai_api_key=my_openai_api_key) + + # Create your representation model with the pre-defined prompt + representation_model = LangChain(llm=chat_model) + + # Create your representation model with a custom prompt + prompt = "What are these documents about? [DOCUMENTS] Here are keywords related to them [KEYWORDS]." + representation_model = LangChain(llm=chat_model, prompt=prompt) + + # Use the representation model in BERTopic on top of the default pipeline + topic_model = BERTopic(representation_model=representation_model) + ``` + + 2. Use a custom LangChain chain for full control over the generation process: + + Remember that the chain will receive two inputs: `DOCUMENTS` and `KEYWORDS` and that it must return directly a string label + or a list of strings. 
+ + Example of a custom chain that uses Microsoft Presidio to anonymize the documents and returns a single label: + ```python + from bertopic.representation import LangChain + from langchain_anthropic import ChatAnthropic + from langchain_core.documents import Document + from langchain_core.prompts import ChatPromptTemplate + from langchain.chains.combine_documents import create_stuff_documents_chain + from langchain_experimental.data_anonymizer.presidio import PresidioReversibleAnonymizer + + prompt = ... + + chat_model = ... + + # We will construct a special privacy-preserving chain using Microsoft Presidio + + pii_handler = PresidioReversibleAnonymizer(analyzed_fields=["PERSON"]) + + chain = ( + { + "DOCUMENTS": ( + lambda inp: [ + Document( + page_content=pii_handler.anonymize( + d.page_content, + language="en", + ), + ) + for d in inp["DOCUMENTS"] + ] + ), + "KEYWORDS": lambda keywords: keywords["KEYWORDS"], + } + | create_stuff_documents_chain(chat_model, prompt, document_variable_name="DOCUMENTS") + ) + + representation_model = LangChain(chain=chain) + ``` + + Example of a custom chain that returns a list of labels: + ```python + from bertopic.representation import LangChain + from langchain_openai import ChatOpenAI + from langchain_core.prompts import ChatPromptTemplate + from langchain.chains.combine_documents import create_stuff_documents_chain + from langchain_core.output_parsers import CommaSeparatedListOutputParser + + chat_model = ... + + list_prompt = ChatPromptTemplate.from_template( + "Here is a list of documents: {DOCUMENTS}. Output a comma-separated list of keywords that represents these documents." 
+ ) + list_chain = create_stuff_documents_chain( + llm=chat_model, + prompt=list_prompt, + document_variable_name="DOCUMENTS", + output_parser=CommaSeparatedListOutputParser() + ) + + representation_model = LangChain(chain=list_chain) + ``` + """ def __init__( self, - chain, - prompt: str = None, + llm: LanguageModelLike = None, + prompt: str = DEFAULT_PROMPT, + chain: Runnable = None, nr_docs: int = 4, diversity: float = None, doc_length: int = None, tokenizer: Union[str, Callable] = None, - chain_config=None, + chain_config: dict = None, ): - self.chain = chain - self.prompt = prompt if prompt is not None else DEFAULT_PROMPT - self.default_prompt_ = DEFAULT_PROMPT - self.chain_config = chain_config + self.prompt = prompt + + if chain is not None: + self.chain = chain + elif llm is not None: + # Check that the prompt contains the necessary placeholder + if "[DOCUMENTS]" not in prompt: + raise ValueError("The prompt must contain the placeholder [DOCUMENTS]") + + # Convert prompt placeholders to the LangChain format + langchain_prompt = prompt.replace("[DOCUMENTS]", "{DOCUMENTS}").replace("[KEYWORDS]", "{KEYWORDS}") + + # Create ChatPromptTemplate + chat_prompt = ChatPromptTemplate.from_template(langchain_prompt) + + # Create a basic LangChain chain using create_stuff_documents_chain + self.chain = create_stuff_documents_chain( + llm, + chat_prompt, + document_variable_name="DOCUMENTS", + document_separator="\n", + ) + else: + raise ValueError("Either `llm` or `chain` must be provided") + self.nr_docs = nr_docs self.diversity = diversity self.doc_length = doc_length self.tokenizer = tokenizer + self.chain_config = chain_config def extract_topics( self, @@ -186,27 +257,49 @@ def extract_topics( for docs in repr_docs_mappings.values() ] - # `self.chain` must take `input_documents` and `question` as input keys - # Use a custom prompt that leverages keywords, using the tag: [KEYWORDS] - if "[KEYWORDS]" in self.prompt: - prompts = [] - for topic in topics: - keywords = 
list(zip(*topics[topic]))[0] - prompt = self.prompt.replace("[KEYWORDS]", ", ".join(keywords)) - prompts.append(prompt) - - inputs = [{"input_documents": docs, "question": prompt} for docs, prompt in zip(chain_docs, prompts)] - - else: - inputs = [{"input_documents": docs, "question": self.prompt} for docs in chain_docs] + # Extract keywords from the topics and format them as a string + formatted_keywords_list = [] + for topic in topics: + keywords = list(zip(*topics[topic]))[0] + formatted_keywords_list.append(", ".join(keywords)) + + # self.chain must accept DOCUMENTS as a mandatory input key and KEYWORDS as an optional input key + # We always pass both keys to the chain, and the chain can choose to use them or not + # Documents are passed as a list of LangChain Document objects, it is up to the chain to format them into a string + inputs = [ + {"DOCUMENTS": docs, "KEYWORDS": formatted_keywords} + for docs, formatted_keywords in zip(chain_docs, formatted_keywords_list) + ] - # `self.chain` must return a dict with an `output_text` key - # same output key as the `StuffDocumentsChain` returned by `load_qa_chain` + # self.chain must return a string label or a list of string labels for each input outputs = self.chain.batch(inputs=inputs, config=self.chain_config) - labels = [output["output_text"].strip() for output in outputs] - updated_topics = { - topic: [(label, 1)] + [("", 0) for _ in range(9)] for topic, label in zip(repr_docs_mappings.keys(), labels) - } + # Process outputs from the chain - can be either strings or lists of strings + updated_topics = {} + for topic, output in zip(repr_docs_mappings.keys(), outputs): + # Each output can be either: + # - A single string representing the main topic label + # - A list of strings representing multiple related labels + if isinstance(output, str): + # For string output: use it as the main label (weight=1) + # and pad with 9 empty strings (weight=0) + labels = [(output.strip(), 1)] + [("", 0) for _ in range(9)] + else: 
+ # For list output: + # 1. Convert all elements to stripped strings + # 2. Take up to 10 elements + # 3. Assign weight of 1 to all elements + # 4. Pad with empty strings if needed to always have 10 elements + clean_outputs = [str(label).strip() for label in output] + top_labels = clean_outputs[:10] + + # Create (label, weight) pairs with weight=1 + labels = [(label, 1) for label in top_labels] + + # Pad with empty strings if we have less than 10 labels + if len(labels) < 10: + labels.extend([("", 0) for _ in range(10 - len(labels))]) + + updated_topics[topic] = labels return updated_topics diff --git a/docs/getting_started/representation/llm.md b/docs/getting_started/representation/llm.md index ab538bd3..59b230a4 100644 --- a/docs/getting_started/representation/llm.md +++ b/docs/getting_started/representation/llm.md @@ -468,50 +468,86 @@ representation_model = OpenAI(client, model="gpt-3.5-turbo", chat=True, prompt=s The above is not constrained to just creating a short description or summary of the topic, we can extract labels, keywords, poems, example documents, extensitive descriptions, and more using this method! If you want to have multiple representations of a single topic, it might be worthwhile to also check out [**multi-aspect**](https://maartengr.github.io/BERTopic/getting_started/multiaspect/multiaspect.html) topic modeling with BERTopic. - ## **LangChain** -[Langchain](https://github.com/hwchase17/langchain) is a package that helps users with chaining large language models. -In BERTopic, we can leverage this package in order to more efficiently combine external knowledge. Here, this -external knowledge are the most representative documents in each topic. +[LangChain](https://github.com/hwchase17/langchain) can be used to generate descriptive topic labels in BERTopic. It supports both basic usage with language models and advanced usage with custom chains for full control over the generation process. 
-To use langchain, you will need to install the langchain package first. Additionally, you will need an underlying LLM to support langchain, -like openai: +To use LangChain, you will need to install it first, along with the specific integration for your chosen language model. For example, to use OpenAI models: ```bash -pip install langchain, openai +pip install langchain +pip install langchain-openai ``` -Then, you can create your chain as follows: +See the [LangChain integrations page](https://python.langchain.com/docs/integrations/chat/) for the full list of supported chat models and their required packages. -```python -from langchain.chains.question_answering import load_qa_chain -from langchain.llms import OpenAI -chain = load_qa_chain(OpenAI(temperature=0, openai_api_key=my_openai_api_key), chain_type="stuff") -``` +There are two main ways to use LangChain with BERTopic: + +### **Basic Usage** -Finally, you can pass the chain to BERTopic as follows: +The simplest way is to use a language model with an optional custom prompt: ```python from bertopic.representation import LangChain +from langchain_openai import ChatOpenAI -# Create your representation model -representation_model = LangChain(chain) +# Create a chat model +chat_model = ChatOpenAI(temperature=0, openai_api_key="...") -# Use the representation model in BERTopic on top of the default pipeline +# Create your representation model with the pre-defined prompt +representation_model = LangChain(llm=chat_model) + +# Use the representation model in BERTopic topic_model = BERTopic(representation_model=representation_model) ``` -You can also use a custom prompt: +You can also customize the prompt: ```python -prompt = "What are these documents about? Please give a single label." -representation_model = LangChain(chain, prompt=prompt) +prompt = "Here is a list of documents: [DOCUMENTS]. These documents are described by these keywords: [KEYWORDS]. Please give a short label." 
+representation_model = LangChain(llm=chat_model, prompt=prompt) ``` -!!! note Note - The prompt does not make use of `[KEYWORDS]` and `[DOCUMENTS]` tags as - the documents are already used within langchain's `load_qa_chain`. +### **Advanced Usage** + +For more control, you can create a custom LangChain chain to generate the representations. The representation of a topic can in that case be a single label or a list of labels, and must be directly returned by the chain. + +Here's an example using multiple labels output: + +```python +from langchain_core.prompts import ChatPromptTemplate +from langchain.chains.combine_documents import create_stuff_documents_chain +from langchain_core.output_parsers import CommaSeparatedListOutputParser + +# Multiple Labels Output +list_prompt = ChatPromptTemplate.from_template( + "Here is a list of documents: {DOCUMENTS}. These documents are described by these keywords: {KEYWORDS}. Output a comma-separated list of labels that represents these documents." +) +list_chain = create_stuff_documents_chain( + llm=chat_model, + prompt=list_prompt, + document_variable_name="DOCUMENTS", + output_parser=CommaSeparatedListOutputParser() +) + +# Use in BERTopic +representation_model = LangChain(chain=list_chain) +topic_model = BERTopic(representation_model=representation_model) +``` + +!!! note + When creating custom chains, the prompt uses LangChain's syntax with curly braces: `{DOCUMENTS}` and `{KEYWORDS}` instead of BERTopic's `[DOCUMENTS]` and `[KEYWORDS]`. + +### **Chain Configuration** + +You can configure the chain with different parameters or add callbacks. For example, to handle rate limits when using external APIs, you can control the number of concurrent requests: + +```python +representation_model = LangChain( + chain=list_chain, + chain_config={"max_concurrency": 5} +) +``` ## **Cohere**