Search prod ready #114

Closed. Wants to merge 33 commits.
Commits
5cff1a2  Search Tool subclass (jacobvm04, Sep 10, 2023)
1ec91dd  Cleaning up search.py (jacobvm04, Sep 10, 2023)
eb8e576  Embedding page summarization (jacobvm04, Sep 11, 2023)
5dc9464  Search documentation (jacobvm04, Sep 11, 2023)
04e568a  Bloom integration + async refactor + local embeddings + reranking (jacobvm04, Sep 12, 2023)
7d8cac6  Embedding Search improvements (jacobvm04, Sep 12, 2023)
c26f948  Remove Todo 😎 (jacobvm04, Sep 12, 2023)
aa6b520  Update README and .env template (jacobvm04, Sep 12, 2023)
5780b78  await BloomChain in web (jacobvm04, Sep 12, 2023)
ccc5404  Update read me + gitignore (jacobvm04, Sep 14, 2023)
21ac962  Merge remote-tracking branch 'plastic-labs/staging' into staging (jacobvm04, Sep 14, 2023)
6d78022  Merge branch 'staging' into search-tool (jacobvm04, Sep 14, 2023)
818878a  Update poetry lock file (jacobvm04, Sep 14, 2023)
6491388  Quick refactors (jacobvm04, Sep 15, 2023)
aa1d717  Web UI Search fixes (jacobvm04, Sep 15, 2023)
f41f5c0  Search switch to gpt-3.5-turbo-instruct (jacobvm04, Sep 20, 2023)
a634abe  add faiss (hyusap, Sep 20, 2023)
976b03a  Merge pull request #4 from plastic-labs/search-tool (hyusap, Sep 20, 2023)
d777f11  Merge pull request #102 from jacobvm04/search-tool (hyusap, Sep 20, 2023)
156d373  Switch to openai embeddings (jacobvm04, Oct 5, 2023)
db2b902  Merge branch 'staging' into search-prod-ready (hyusap, Oct 7, 2023)
0edee99  lock update (hyusap, Oct 7, 2023)
4dcf397  tool llm name change (jacobvm04, Nov 1, 2023)
a132fea  Changing response prompt instructions for the search_ready prompt (jacobvm04, Nov 1, 2023)
9d0b71c  Refactoring the search tool, updating sync run (jacobvm04, Nov 1, 2023)
33478d6  Refactor search to be synchronous (jacobvm04, Nov 1, 2023)
a3c045b  Decouple search logic from respond (jacobvm04, Nov 1, 2023)
ece5151  Azure compatability (VVoruganti, Nov 28, 2023)
dac800f  Async refactor (VVoruganti, Nov 28, 2023)
67e3b2b  fix: need gcc, using build-essential to future proof (vintrocode, Nov 30, 2023)
04fc6e4  implement autoscroll (#112) (hyusap, Nov 8, 2023)
ac4addd  Search debugging! (jacobvm04, Dec 3, 2023)
d130b8e  Remove unnecessary playwright dependency (VVoruganti, Dec 8, 2023)
4 changes: 4 additions & 0 deletions .env.template
@@ -9,6 +9,10 @@ SUPABASE_KEY=
MEMORY_TABLE=
CONVERSATION_TABLE=

SERPER_API_KEY=
# Set true if reranking should be enabled, follow instructions here to install: https://github.com/FlagOpen/FlagEmbedding/blob/master/FlagEmbedding/baai_general_embedding/README.md
USE_RERANKER=

# FastAPI
URL=

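For reference, both new variables are read straight from the environment, and the new `agent/tools/search.py` (shown below) compares `USE_RERANKER` against the lowercase string "true", so any other value leaves the reranker off. A minimal sketch of how they are consumed, assuming python-dotenv as used elsewhere in the repo:

```python
# Minimal sketch (not part of the PR) of how the new .env values are consumed.
import os

from dotenv import load_dotenv

load_dotenv()  # populate os.environ from the .env file

# GoogleSerperAPIWrapper reads SERPER_API_KEY from the environment.
serper_api_key = os.environ.get("SERPER_API_KEY")

# The reranker is enabled only when the value is exactly the string "true".
use_reranker = os.environ.get("USE_RERANKER") == "true"
```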
2 changes: 2 additions & 0 deletions Dockerfile
@@ -13,6 +13,8 @@ ENV PYTHONFAULTHANDLER=1 \
PIP_DEFAULT_TIMEOUT=100 \
POETRY_VERSION=1.4.1

RUN apt-get update && apt-get install -y build-essential

RUN pip install "poetry==$POETRY_VERSION"

# Copy only requirements to cache them in docker layer
13 changes: 7 additions & 6 deletions README.md
@@ -1,10 +1,10 @@
# tutor-gpt

Tutor-GPT is a LangChain LLM application. It dynamically reasons about your learning needs and *updates its own prompts* to best serve you.
Tutor-GPT is a LangChain LLM application. It dynamically reasons about your learning needs and _updates its own prompts_ to best serve you.

We leaned into theory of mind experiments and Bloom is now more than just a literacy tutor, it’s an expansive learning companion. Read more about how it works [here](https://plasticlabs.ai/blog/Theory-of-Mind-is-All-You-Need) or you can join our [Discord](https://discord.gg/bloombotai) to try out our implementation for free (while our OpenAI spend lasts 😄).

Alternatively, you can run your own instance of the bot by following the instructions below.

## Installation

@@ -16,15 +16,16 @@ This app requires you to have a few different environment variables set. Create

**OPENAI_API_KEY**: Go to [OpenAI](https://beta.openai.com/account/api-keys) to generate your own API key.
**BOT_TOKEN**: This is the discord bot token. You can find instructions on how to create a bot and generate a token in the [pycord docs](https://guide.pycord.dev/getting-started/creating-your-first-bot).
**THOUGHT_CHANNEL_ID**: This is the discord channel for the bot to output thoughts to. Make a channel in your server and copy the ID by right clicking the channel and copying the link. The channel ID is the last string of numbers in the link.
**SERPER_API_KEY**: This is the API key for the Google search API provided by Serper. You can get one at [Serper.dev](https://serper.dev/).

### Docker/Containerization

The repository contains a `Dockerfile` for running the bot in a containerized workflow. Use the following command to build and run the container locally:

```bash
docker build -t tutor-gpt:latest .
docker run --env-file .env tutor-gpt
```

The current behaviour will utilize the `.env` file in your local repository and
@@ -40,7 +41,7 @@ docker run -p 8501:8501 --env-file .env tutor-gpt python -u -m streamlit run www
### Architecture

Below is a high-level diagram of the architecture for the bot.
![Tutor-GPT Discord Architecture](assets/ToM Chain Flow.png)
![Tutor-GPT Discord Architecture](<assets/ToM Chain Flow.png>)

## Contributing

84 changes: 70 additions & 14 deletions agent/chain.py
@@ -1,16 +1,22 @@
import os
from langchain.chat_models import ChatOpenAI, AzureChatOpenAI
from collections.abc import AsyncIterator

import sentry_sdk
from dotenv import load_dotenv
from langchain.chat_models import AzureChatOpenAI, ChatOpenAI
from langchain.embeddings import HuggingFaceBgeEmbeddings, OpenAIEmbeddings
from langchain.llms import OpenAI, AzureOpenAI
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain.prompts import (
ChatPromptTemplate,
SystemMessagePromptTemplate,
load_prompt,
)
from langchain.prompts import load_prompt, ChatPromptTemplate
from langchain.schema import AIMessage, HumanMessage, BaseMessage
from dotenv import load_dotenv
from langchain.schema import AIMessage, BaseMessage, HumanMessage, SystemMessage

from collections.abc import AsyncIterator
from .cache import Conversation
from agent.tools.search import SearchTool, search_ready_output_parser

import sentry_sdk
from .cache import Conversation

load_dotenv()

@@ -25,13 +31,34 @@ class BloomChain:
llm: AzureChatOpenAI | ChatOpenAI
if (os.environ.get("OPENAI_API_TYPE") == "azure"):
llm = AzureChatOpenAI(deployment_name = os.environ['OPENAI_API_DEPLOYMENT_NAME'], temperature=1.2, model_kwargs={"top_p": 0.5})
tool_llm = AzureOpenAI(deployment_name = os.environ['OPENAI_API_TOOL_DEPLOYMENT_NAME'], temperature=0.3, top_p=0.5)
else:
llm = ChatOpenAI(model_name = "gpt-4", temperature=1.2, model_kwargs={"top_p": 0.5})
tool_llm = OpenAI(model_name = "gpt-3.5-turbo-instruct", temperature=0.3, top_p=0.5)

system_thought: SystemMessagePromptTemplate = SystemMessagePromptTemplate(prompt=SYSTEM_THOUGHT)
system_response: SystemMessagePromptTemplate = SystemMessagePromptTemplate(prompt=SYSTEM_RESPONSE)
system_user_prediction_thought: SystemMessagePromptTemplate = SystemMessagePromptTemplate(prompt=SYSTEM_USER_PREDICTION_THOUGHT)


search_tool: SearchTool
# Load Embeddings for search
# model_name = "BAAI/bge-small-en-v1.5"
# model_kwargs = {'device': 'cpu'}
# encode_kwargs = {'normalize_embeddings': False}
# embeddings = HuggingFaceBgeEmbeddings(
# model_name=model_name,
# model_kwargs=model_kwargs,
# encode_kwargs=encode_kwargs
# )
embeddings = OpenAIEmbeddings(
deployment=os.environ["OPENAI_API_EMBEDDING_DEPLOYMENT_NAME"],
model="text-embedding-ada-002",
openai_api_base=os.environ["OPENAI_API_BASE"],
openai_api_type=os.environ["OPENAI_API_TYPE"],
)
search_tool = SearchTool.from_llm(llm=tool_llm, embeddings=embeddings)

def __init__(self) -> None:
pass
# def __init__(self, llm: AzureChatOpenAI = AzureChatOpenAI(deployment_name = "vineeth-gpt35-16k-230828", temperature=1.2), verbose: bool = True) -> None:
@@ -44,8 +71,9 @@ def __init__(self) -> None:

@classmethod
@sentry_sdk.trace
def think(cls, cache: Conversation, input: str):
async def think(cls, cache: Conversation, input: str):
"""Generate Bloom's thought on the user."""

# load message history
thought_prompt = ChatPromptTemplate.from_messages([
cls.system_thought,
@@ -57,28 +85,56 @@ def think(cls, cache: Conversation, input: str):
cache.add_message("thought", HumanMessage(content=input))

return Streamable(
chain.astream({}, {"tags": ["thought"], "metadata": {"conversation_id": cache.conversation_id, "user_id": cache.user_id}}),
chain.astream({}, {"tags": ["thought"], "metadata": {"conversation_id": cache.conversation_id, "user_id": cache.user_id}}),
lambda thought: cache.add_message("thought", AIMessage(content=thought))
)

@classmethod
@sentry_sdk.trace
def respond(cls, cache: Conversation, thought: str, input: str):
async def respond(cls, cache: Conversation, thought: str, input: str):
Review thread on this line:

Collaborator: The inferences for the search tool are all done sequentially, so there isn't any added benefit to running it with async, and it changes the interface everywhere else the respond method is used.

For consistency we should remove those. I'm also wondering whether the logic for tools should be encoded within the respond method, or whether the BloomChain class should have a set of methods for tool support that we then add to the chat and stream methods.

Contributor: Hm, I did include a synchronous implementation for the search tool, but right now it's set up to use the async implementation, which runs each inference concurrently and is much faster in my testing.

I agree on the interface issue; do you think using threads instead here might be cleaner?

Contributor: I think for now, nest_asyncio should work to make the search_tool call sync. It seems like LangChain doesn't really work with threading yet.

Contributor: I've also decoupled the search logic from respond, so feel free to let me know your thoughts on that too!

"""Generate Bloom's response to the user."""

response_prompt = ChatPromptTemplate.from_messages([
cls.system_response,
*cache.messages("response"),
HumanMessage(content=input)
])

# apply search step
response_prompt = await cls.search_step(response_prompt.format_messages(thought=thought))

chain = response_prompt | cls.llm

cache.add_message("response", HumanMessage(content=input))

return Streamable(
chain.astream({ "thought": thought }, {"tags": ["response"], "metadata": {"conversation_id": cache.conversation_id, "user_id": cache.user_id}}),
chain.astream({"thought": thought}, {"tags": ["response"], "metadata": {"conversation_id": cache.conversation_id, "user_id": cache.user_id}}),
lambda response: cache.add_message("response", AIMessage(content=response))
)


@classmethod
@sentry_sdk.trace
async def search_step(cls, messages: list[BaseMessage]):
search_messages = messages.copy()
search_messages.append(SystemMessage(content=f"Reason about whether or not a google search would be benificial to answer the question. Always use it if you are unsure about your knowledge.\n\nf{search_ready_output_parser.get_format_instructions()}"))

search_ready_message = cls.llm.predict_messages(search_messages)
search_ready = search_ready_output_parser.parse(search_ready_message.content)

if search_ready["Search"].lower() == "true":
search_messages.append(search_ready_message)
search_messages.append(SystemMessage(content=f"Now generate a google query that would be best to find information to answer the question."))

search_query_message = cls.llm.predict_messages(search_messages)
# search_result_summary = cls.search_tool.run(search_query_message.content)
search_result_summary = await cls.search_tool.arun(search_query_message.content)

messages.append(SystemMessage(content=f"Use the information from these searchs to help answer your question.\nMake sure to not just repeat answers from sources, provide the sources justifications when possible. More detail is better.\n\nRelevant Google Search: {search_query_message.content}\n\n{search_result_summary}\n\nCite your sources via bracket notation with numbers (don't use any other special characters like \"^\", only use \"[\" and \"]\"), and include the full links at the end."))

return ChatPromptTemplate.from_messages(messages)


@classmethod
@sentry_sdk.trace
async def think_user_prediction(cls, cache: Conversation):
@@ -103,10 +159,10 @@ async def think_user_prediction(cls, cache: Conversation):
@classmethod
@sentry_sdk.trace
async def chat(cls, cache: Conversation, inp: str ) -> tuple[str, str]:
thought_iterator = cls.think(cache, inp)
thought_iterator = await cls.think(cache, inp)
thought = await thought_iterator()

response_iterator = cls.respond(cache, thought, inp)
response_iterator = await cls.respond(cache, thought, inp)
response = await response_iterator()

await cls.think_user_prediction(cache)
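For context on the review thread above about keeping `respond` synchronous: nest_asyncio is the usual way to drive an async LangChain call from sync code when an event loop is already running. A rough sketch of that approach, not part of this diff, with an illustrative helper name:

```python
# Sketch of the nest_asyncio idea from the review thread (not part of this PR).
# nest_asyncio patches the running event loop so run_until_complete can be
# called re-entrantly, e.g. from a notebook or an async web framework.
import asyncio

import nest_asyncio  # https://github.com/erdewit/nest_asyncio

nest_asyncio.apply()


def run_search_sync(search_tool, query: str) -> str:
    """Illustrative helper: run the async search tool from synchronous code."""
    loop = asyncio.get_event_loop()
    return loop.run_until_complete(search_tool.arun(query))
```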
2 changes: 1 addition & 1 deletion agent/prompts/response.yaml
@@ -5,4 +5,4 @@ template: >

{thought}

You must produce an appropriate response to the user input. Format equations in LaTeX and wrap in dollar signs like this: $\LaTeX$. Use markdown code syntax. Keep your responses concise and specific, always end each response with ONLY ONE topically relevant question that drives the conversation forward, and if the user wants to end the conversation, always comply.
You must produce an appropriate response to the user input. Use markdown code and $\LaTeX$ syntax. Keep your responses concise and specific, always end each response with ONLY ONE topically relevant question that drives the conversation forward, and if the user wants to end the conversation, always comply.
146 changes: 146 additions & 0 deletions agent/tools/search.py
@@ -0,0 +1,146 @@
import asyncio
import logging
import os
from typing import Optional, Type

from dotenv import load_dotenv
from langchain.callbacks.manager import (
AsyncCallbackManagerForToolRun,
CallbackManagerForToolRun,
)
from langchain.chains import LLMChain
from langchain.docstore.document import Document
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import Html2TextTransformer
from langchain.embeddings.base import Embeddings
from langchain.llms.base import BaseLLM
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain.prompts import Prompt
from langchain.text_splitter import TokenTextSplitter
from langchain.tools.base import BaseTool
from langchain.utilities import GoogleSerperAPIWrapper
from langchain.vectorstores import FAISS

logger = logging.getLogger(__name__)
load_dotenv() # Load environment variables


# import nest_asyncio
# nest_asyncio.apply() # https://github.com/erdewit/nest_asyncio

# TODO: Store search results for entire conversation in vector store
# TODO: Add answerbox to search results when available

class SearchTool(BaseTool):
name: str = "search"
description: str = "useful for when you need to search for something on the internet"
llm: BaseLLM
embeddings: Embeddings
search: GoogleSerperAPIWrapper

@classmethod
def from_llm(cls, llm: BaseLLM, embeddings: Embeddings):
"""Return a tool from a chat model."""
search = GoogleSerperAPIWrapper()
search.k = 3

if os.environ.get("USE_RERANKER") == "true":
from FlagEmbedding import FlagReranker
model = 'BAAI/bge-reranker-base'

cls.reranker = FlagReranker(model)
logger.info(f"Loaded reranker \"{model}\" for webpage search")

return cls(llm=llm, embeddings=embeddings, search=search)

def _run(
self, query: str, run_manager: Optional[CallbackManagerForToolRun] = None
) -> str:
"""Use the tool."""
return asyncio.run(self._arun(query=query))

async def _arun(
self, query: str, run_manager: Optional[AsyncCallbackManagerForToolRun] = None
) -> str:
"""Use the tool asynchronously."""

# remove quotes from query if present
if query[0] == '"' and query[-1] == '"':
query = query[1:-1]

results = await self.search.aresults(query=query)
organic_results = results["organic"]

summaries = await asyncio.gather(*[self._aresearch_url(result["link"], query) for result in organic_results])
relevant_results = [
{
"title": result["title"],
"snippet": result["snippet"],
"link": result["link"],
"summary": summary,
} for result, summary in zip(organic_results, summaries)
]

formatted_results = [
f"{result['title']} - {result['link']}\nSnippet: {result['snippet']}\nPage Summary: {result['summary']}" for result in relevant_results
]
formatted_results = "Search Results:\n" + "\n----------------------\n\n".join(formatted_results)

return formatted_results

async def _aresearch_url(self, url: str, query: str):
"""Research a URL by embedding the web page and then using the most relevant sections to the query to generate a summary of the most important information on the page."""

prompt = Prompt.from_template("Your job is to summarize the information on the web page AS IT PERTAINS TO THE QUERY. You will be given a few selected sections of the web page to base your answer off of. \n\nQuestion: {query}\n\nBEGIN SELECTIONS\n{doc}\nEND SELECTIONS")
llm_chain = LLMChain(llm=self.llm, prompt=prompt)

try:
# Load HTML
loader = AsyncHtmlLoader([url])
html2text = Html2TextTransformer()
text_splitter = TokenTextSplitter(chunk_size=300, chunk_overlap=0)

html = loader.load()
docs = html2text.transform_documents(html)
docs = text_splitter.split_documents(docs)

# embedding search
db = FAISS.from_documents(docs, self.embeddings)
# query prefix is used per instructions https://github.com/FlagOpen/FlagEmbedding
relevant_sections = await db.asimilarity_search(query=("Represent this sentence for searching relevant passages: " + query), k=12)

# rerank
if hasattr(self, "reranker"):
scores = self.reranker.compute_score([[query, section.page_content] for section in relevant_sections])
# if there's only one section, scores is a single float, not a list
if isinstance(scores, float):
scores = [scores]

scores_with_index = zip(scores, range(len(scores)))
scores_with_index = sorted(scores_with_index, key=lambda x: x[0], reverse=True)
relevant_sections = [relevant_sections[index] for _score, index in scores_with_index]

logger.info("Reranked webpage sections, different from original order: " + str([index for _score, index in scores_with_index]) + " Chunk count: " + str(len(docs)))

# format sections together to be used as input to the LLM
relevant_sections = "\n".join([f'"{section.page_content}"' for section in relevant_sections[:3]])

# summarize the relevant sections
summary = await llm_chain.arun({"query": query, "doc": relevant_sections})
return summary
except Exception as e:
logger.error("Error loading HTML:", e)
return f"Error loading HTML: {e}"


search_generation_schemas = [
ResponseSchema(name="Reasoning", description="Reasoning behind what google query would be best to find information to answer the question"),
ResponseSchema(name="Search Query", description="The google query that would be best to find information to answer the question. DO NOT USE ANY QUOTES OR OTHER SPECIAL CHARACTERS ANYWHERE."),
]
search_generation_output_parser = StructuredOutputParser.from_response_schemas(search_generation_schemas)

search_ready_schemas = [
ResponseSchema(name="Reasoning", description="Reasoning behind whether or not a google search would be necessary to effectively answer the question."),
ResponseSchema(name="Search", description="<true/false> whether or not a google search should be used to find information to answer the question."),
]
search_ready_output_parser = StructuredOutputParser.from_response_schemas(search_ready_schemas)
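For orientation, here is a standalone usage sketch of the new tool. The model and embedding choices mirror the ones used in `agent/chain.py` but are assumptions for this example, and it requires `SERPER_API_KEY` plus an OpenAI key in the environment:

```python
# Illustrative usage of SearchTool (not part of the PR). Model and embedding
# choices are assumptions; SERPER_API_KEY and OPENAI_API_KEY must be set.
import asyncio

from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI

from agent.tools.search import SearchTool

llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0.3)
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
tool = SearchTool.from_llm(llm=llm, embeddings=embeddings)

# arun() (inherited from BaseTool) dispatches to _arun above: it queries Serper,
# summarizes the top result pages concurrently, and returns a formatted report.
report = asyncio.run(tool.arun("latest stable Python release"))
print(report)
```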