From a696d3ef0ef5824a28f389c06f197ff56469be2a Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Sun, 15 Dec 2024 16:08:12 +0100 Subject: [PATCH] LLM: Update to langchain-cratedb 0.0.0 After quite a bit of back and forth, and a slow genesis in general, this subsystem is finally getting ready for take-off. On the chrome/surface/interface, this update doesn't change much, just a few bits of "naming things". --- .../llm-langchain/conversational_memory.ipynb | 40 ++++++++----------- .../llm-langchain/conversational_memory.py | 12 +++--- .../cratedb-vectorstore-rag-openai-sql.ipynb | 6 +-- .../cratedb_rag_customer_support.ipynb | 17 +++----- ...atedb_rag_customer_support_langchain.ipynb | 10 ++--- ...ratedb_rag_customer_support_vertexai.ipynb | 18 +++------ .../llm-langchain/document_loader.ipynb | 2 +- .../llm-langchain/document_loader.py | 2 +- .../llm-langchain/requirements-dev.txt | 6 +-- .../llm-langchain/requirements.txt | 12 +----- .../llm-langchain/vector_search.ipynb | 18 ++++----- .../llm-langchain/vector_search.py | 4 +- 12 files changed, 58 insertions(+), 89 deletions(-) diff --git a/topic/machine-learning/llm-langchain/conversational_memory.ipynb b/topic/machine-learning/llm-langchain/conversational_memory.ipynb index fb9a9dd3..26b3c39e 100644 --- a/topic/machine-learning/llm-langchain/conversational_memory.ipynb +++ b/topic/machine-learning/llm-langchain/conversational_memory.ipynb @@ -59,18 +59,18 @@ "execution_count": 2, "outputs": [], "source": [ - "from langchain_community.chat_message_histories import CrateDBChatMessageHistory\n", + "from langchain_cratedb.chat_history import CrateDBChatMessageHistory\n", "\n", "# Connect to a self-managed CrateDB instance.\n", "CONNECTION_STRING = \"crate://crate@localhost/?schema=notebook\"\n", "\n", - "chat_message_history = CrateDBChatMessageHistory(\n", + "chat_history = CrateDBChatMessageHistory(\n", "\tsession_id=\"test_session\",\n", "\tconnection_string=CONNECTION_STRING\n", ")\n", "\n", "# Make sure
to start with a blank canvas.\n", - "chat_message_history.clear()" + "chat_history.clear()" ], "metadata": { "collapsed": false @@ -90,8 +90,8 @@ "execution_count": 3, "outputs": [], "source": [ - "chat_message_history.add_user_message(\"Hello\")\n", - "chat_message_history.add_ai_message(\"Hi\")" + "chat_history.add_user_message(\"Hello\")\n", + "chat_history.add_ai_message(\"Hi\")" ], "metadata": { "collapsed": false, @@ -117,9 +117,7 @@ "output_type": "execute_result" } ], - "source": [ - "chat_message_history.messages" - ], + "source": "chat_history.messages", "metadata": { "collapsed": false, "ExecuteTime": { @@ -214,7 +212,7 @@ "\n", "\tBase.metadata.drop_all(bind=sa.create_engine(CONNECTION_STRING))\n", "\n", - "\tchat_message_history = CrateDBChatMessageHistory(\n", + "\tchat_history = CrateDBChatMessageHistory(\n", "\t\tsession_id=\"test_session\",\n", "\t\tconnection_string=CONNECTION_STRING,\n", "\t\tcustom_message_converter=CustomMessageConverter(\n", @@ -223,10 +221,10 @@ "\t)\n", "\n", "\t# Make sure to start with a blank canvas.\n", - "\tchat_message_history.clear()\n", + "\tchat_history.clear()\n", "\n", - "\tchat_message_history.add_user_message(\"Hello\")\n", - "\tchat_message_history.add_ai_message(\"Hi\")" + "\tchat_history.add_user_message(\"Hello\")\n", + "\tchat_history.add_ai_message(\"Hi\")" ], "metadata": { "collapsed": false, @@ -252,9 +250,7 @@ "output_type": "execute_result" } ], - "source": [ - "chat_message_history.messages" - ], + "source": "chat_history.messages", "metadata": { "collapsed": false, "ExecuteTime": { @@ -286,7 +282,7 @@ "import json\n", "import typing as t\n", "\n", - "from langchain_community.chat_message_histories.cratedb import CrateDBMessageConverter\n", + "from langchain_cratedb.chat_history import CrateDBMessageConverter\n", "from langchain.schema import _message_to_dict\n", "\n", "\n", @@ -312,7 +308,7 @@ "if __name__ == \"__main__\":\n", "\tBase.metadata.drop_all(bind=sa.create_engine(CONNECTION_STRING))\n", 
"\n", - "\tchat_message_history = CrateDBChatMessageHistory(\n", + "\tchat_history = CrateDBChatMessageHistory(\n", "\t\tsession_id=\"test_session\",\n", "\t\tconnection_string=CONNECTION_STRING,\n", "\t\tcustom_message_converter=CustomMessageConverterWithDifferentSessionIdColumn(),\n", @@ -320,10 +316,10 @@ "\t)\n", "\n", "\t# Make sure to start with a blank canvas.\n", - "\tchat_message_history.clear()\n", + "\tchat_history.clear()\n", "\n", - "\tchat_message_history.add_user_message(\"Hello\")\n", - "\tchat_message_history.add_ai_message(\"Hi\")" + "\tchat_history.add_user_message(\"Hello\")\n", + "\tchat_history.add_ai_message(\"Hi\")" ], "metadata": { "collapsed": false @@ -344,9 +340,7 @@ "output_type": "execute_result" } ], - "source": [ - "chat_message_history.messages" - ], + "source": "chat_history.messages", "metadata": { "collapsed": false } diff --git a/topic/machine-learning/llm-langchain/conversational_memory.py b/topic/machine-learning/llm-langchain/conversational_memory.py index f68de464..914bb467 100644 --- a/topic/machine-learning/llm-langchain/conversational_memory.py +++ b/topic/machine-learning/llm-langchain/conversational_memory.py @@ -16,7 +16,7 @@ import os from pprint import pprint -from langchain_community.chat_message_histories import CrateDBChatMessageHistory +from langchain_cratedb.chat_history import CrateDBChatMessageHistory CONNECTION_STRING = os.environ.get( @@ -27,13 +27,13 @@ def main(): - chat_message_history = CrateDBChatMessageHistory( + chat_history = CrateDBChatMessageHistory( session_id="test_session", - connection_string=CONNECTION_STRING, + connection=CONNECTION_STRING, ) - chat_message_history.add_user_message("Hello") - chat_message_history.add_ai_message("Hi") - pprint(chat_message_history.messages) + chat_history.add_user_message("Hello") + chat_history.add_ai_message("Hi") + pprint(chat_history.messages) if __name__ == "__main__": diff --git 
a/topic/machine-learning/llm-langchain/cratedb-vectorstore-rag-openai-sql.ipynb b/topic/machine-learning/llm-langchain/cratedb-vectorstore-rag-openai-sql.ipynb index a169a9d7..4ae933d6 100644 --- a/topic/machine-learning/llm-langchain/cratedb-vectorstore-rag-openai-sql.ipynb +++ b/topic/machine-learning/llm-langchain/cratedb-vectorstore-rag-openai-sql.ipynb @@ -75,13 +75,11 @@ "metadata": {}, "outputs": [], "source": [ - "import openai\n", "import pandas as pd\n", "import sqlalchemy as sa\n", "\n", "from langchain_community.document_loaders import PyPDFLoader\n", - "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", - "from langchain_openai import OpenAIEmbeddings" + "from langchain_text_splitters import RecursiveCharacterTextSplitter" ] }, { @@ -162,7 +160,7 @@ "# environment variables.\n", "import os\n", "\n", - "CONNECTION_STRING = CrateDBVectorSearch.connection_string_from_db_params(\n", + "CONNECTION_STRING = CrateDBVectorStore.connection_string_from_db_params(\n", " driver=os.environ.get(\"CRATEDB_DRIVER\", \"crate\"),\n", " host=os.environ.get(\"CRATEDB_HOST\", \"localhost\"),\n", " port=int(os.environ.get(\"CRATEDB_PORT\", \"4200\")),\n", diff --git a/topic/machine-learning/llm-langchain/cratedb_rag_customer_support.ipynb b/topic/machine-learning/llm-langchain/cratedb_rag_customer_support.ipynb index 88776c32..29aa3cb5 100644 --- a/topic/machine-learning/llm-langchain/cratedb_rag_customer_support.ipynb +++ b/topic/machine-learning/llm-langchain/cratedb_rag_customer_support.ipynb @@ -65,20 +65,15 @@ }, "outputs": [], "source": [ - "from langchain.chains import RetrievalQA, ConversationalRetrievalChain\n", - "from langchain_openai import ChatOpenAI, OpenAI, OpenAIEmbeddings\n", - "import pandas as pd\n", - "import sqlalchemy as sa\n", - "from sqlalchemy import create_engine\n", - "from sqlalchemy import text\n", - "import crate\n", "import openai\n", - "import os\n", "import requests\n", - "from pueblo.util.environ import getenvpass\n", + 
"import pandas as pd\n", + "from langchain.chains import RetrievalQA, ConversationalRetrievalChain\n", "from langchain_community.document_loaders import CSVLoader\n", - "from langchain_community.vectorstores import Chroma\n", - "from langchain_text_splitters import RecursiveCharacterTextSplitter" + "from langchain_openai import OpenAIEmbeddings\n", + "from pueblo.util.environ import getenvpass\n", + "from sqlalchemy import create_engine\n", + "from sqlalchemy import text" ] }, { diff --git a/topic/machine-learning/llm-langchain/cratedb_rag_customer_support_langchain.ipynb b/topic/machine-learning/llm-langchain/cratedb_rag_customer_support_langchain.ipynb index ef40db5a..967173e2 100644 --- a/topic/machine-learning/llm-langchain/cratedb_rag_customer_support_langchain.ipynb +++ b/topic/machine-learning/llm-langchain/cratedb_rag_customer_support_langchain.ipynb @@ -107,7 +107,7 @@ "from pueblo.util.environ import getenvpass\n", "from langchain_openai import OpenAIEmbeddings\n", "from langchain_community.document_loaders import CSVLoader\n", - "from langchain_community.vectorstores import CrateDBVectorSearch\n", + "from langchain_cratedb.vectorstores import CrateDBVectorStore\n", "\n", "warnings.filterwarnings('ignore')" ] @@ -301,11 +301,11 @@ "source": [ "embeddings = OpenAIEmbeddings()\n", "\n", - "store = CrateDBVectorSearch.from_documents(\n", + "store = CrateDBVectorStore.from_documents(\n", " embedding=embeddings,\n", " documents=data,\n", " collection_name=COLLECTION_NAME,\n", - " connection_string=CONNECTION_STRING,\n", + " connection=CONNECTION_STRING,\n", ")" ] }, @@ -519,11 +519,11 @@ "\n", "COLLECTION_NAME = \"customer_data_jina\"\n", "\n", - "store = CrateDBVectorSearch.from_documents(\n", + "store = CrateDBVectorStore.from_documents(\n", " embedding=embeddings,\n", " documents=data,\n", " collection_name=COLLECTION_NAME,\n", - " connection_string=CONNECTION_STRING,\n", + " connection=CONNECTION_STRING,\n", ")\n", "documents = return_documents(store, 
my_question)" ] diff --git a/topic/machine-learning/llm-langchain/cratedb_rag_customer_support_vertexai.ipynb b/topic/machine-learning/llm-langchain/cratedb_rag_customer_support_vertexai.ipynb index e2d775cc..11d5033d 100644 --- a/topic/machine-learning/llm-langchain/cratedb_rag_customer_support_vertexai.ipynb +++ b/topic/machine-learning/llm-langchain/cratedb_rag_customer_support_vertexai.ipynb @@ -97,28 +97,20 @@ "outputs": [], "source": [ "import os\n", + "import re\n", "\n", - "import openai\n", "import pandas as pd\n", - "import warnings\n", "import requests\n", - "import re\n", - "from typing import Dict, List, Optional, Tuple, Union\n", - "\n", + "import warnings\n", "\n", - "from pueblo.util.environ import getenvpass\n", "from google.cloud import aiplatform\n", "from vertexai.generative_models import (\n", " GenerationConfig,\n", - " GenerationResponse,\n", " GenerativeModel,\n", - " HarmBlockThreshold,\n", - " HarmCategory,\n", ")\n", "from langchain_community.document_loaders import CSVLoader\n", "from langchain_community.embeddings import VertexAIEmbeddings\n", - "from langchain_community.llms import VertexAI\n", - "from langchain_community.vectorstores import CrateDBVectorSearch\n", + "from langchain_cratedb.vectorstores import CrateDBVectorStore\n", "\n", "warnings.filterwarnings('ignore')" ] @@ -347,11 +339,11 @@ "source": [ "embeddings = VertexAIEmbeddings(model_name=\"textembedding-gecko@001\")\n", "\n", - "store = CrateDBVectorSearch.from_documents(\n", + "store = CrateDBVectorStore.from_documents(\n", " embedding=embeddings,\n", " documents=data,\n", " collection_name=COLLECTION_NAME,\n", - " connection_string=CONNECTION_STRING,\n", + " connection=CONNECTION_STRING,\n", ")" ] }, diff --git a/topic/machine-learning/llm-langchain/document_loader.ipynb b/topic/machine-learning/llm-langchain/document_loader.ipynb index bab75e51..8d6d4c69 100644 --- a/topic/machine-learning/llm-langchain/document_loader.ipynb +++ 
b/topic/machine-learning/llm-langchain/document_loader.ipynb @@ -107,8 +107,8 @@ "outputs": [], "source": [ "import sqlalchemy as sa\n", - "from langchain_community.document_loaders import CrateDBLoader\n", "from langchain_community.utilities.sql_database import SQLDatabase\n", + "from langchain_cratedb.loaders import CrateDBLoader\n", "from pprint import pprint\n", "\n", "db = SQLDatabase(engine=sa.create_engine(CONNECTION_STRING))\n", diff --git a/topic/machine-learning/llm-langchain/document_loader.py b/topic/machine-learning/llm-langchain/document_loader.py index 7f656041..43e6a14c 100644 --- a/topic/machine-learning/llm-langchain/document_loader.py +++ b/topic/machine-learning/llm-langchain/document_loader.py @@ -29,8 +29,8 @@ import requests import sqlalchemy as sa from cratedb_toolkit.util import DatabaseAdapter -from langchain_community.document_loaders import CrateDBLoader from langchain_community.utilities.sql_database import SQLDatabase +from langchain_cratedb.loaders import CrateDBLoader from pprint import pprint diff --git a/topic/machine-learning/llm-langchain/requirements-dev.txt b/topic/machine-learning/llm-langchain/requirements-dev.txt index 7220a15c..d957bd32 100644 --- a/topic/machine-learning/llm-langchain/requirements-dev.txt +++ b/topic/machine-learning/llm-langchain/requirements-dev.txt @@ -1,11 +1,11 @@ -# Real. +# Production. cratedb-toolkit[io] pueblo[notebook,testing] -# Development. +# Staging. # cratedb-toolkit[io] @ git+https://github.com/crate-workbench/cratedb-toolkit.git@main # pueblo[notebook,testing] @ git+https://github.com/pyveci/pueblo.git@main -# Workstation. +# Development. 
#--editable=/Users/amo/dev/crate/ecosystem/cratedb-retentions[io] #--editable=/Users/amo/dev/pyveci/sources/pueblo[testing] diff --git a/topic/machine-learning/llm-langchain/requirements.txt b/topic/machine-learning/llm-langchain/requirements.txt index e8dc5d0d..2173bfb3 100644 --- a/topic/machine-learning/llm-langchain/requirements.txt +++ b/topic/machine-learning/llm-langchain/requirements.txt @@ -1,22 +1,12 @@ -# Real. crash -crate>=1.0.0.dev2 google-cloud-aiplatform<2 +langchain-cratedb<0.0.1 langchain-google-vertexai<3 langchain-openai<0.3 langchain-text-splitters<0.4 pueblo[cli,nlp]>=0.0.10 -pydantic>=2,<3 pypdf<6 python-dotenv<2 requests<3 requests-cache<2 -sqlalchemy==2.* -sqlalchemy-cratedb>=0.40.0 unstructured<0.17 - -# Development. -# cratedb-toolkit @ git+https://github.com/crate-workbench/cratedb-toolkit.git@main -langchain @ git+https://github.com/crate-workbench/langchain.git@cratedb#subdirectory=libs/langchain -langchain-community @ git+https://github.com/crate-workbench/langchain.git@cratedb#subdirectory=libs/community -# pueblo[cli,fileio,nlp] @ git+https://github.com/pyveci/pueblo.git@main diff --git a/topic/machine-learning/llm-langchain/vector_search.ipynb b/topic/machine-learning/llm-langchain/vector_search.ipynb index f596d10b..80d4efb9 100644 --- a/topic/machine-learning/llm-langchain/vector_search.ipynb +++ b/topic/machine-learning/llm-langchain/vector_search.ipynb @@ -142,7 +142,7 @@ "# environment variables.\n", "import os\n", "\n", - "CONNECTION_STRING = CrateDBVectorSearch.connection_string_from_db_params(\n", + "CONNECTION_STRING = CrateDBVectorStore.connection_string_from_db_params(\n", " driver=os.environ.get(\"CRATEDB_DRIVER\", \"crate\"),\n", " host=os.environ.get(\"CRATEDB_HOST\", \"localhost\"),\n", " port=int(os.environ.get(\"CRATEDB_PORT\", \"4200\")),\n", @@ -166,8 +166,8 @@ "metadata": {}, "outputs": [], "source": [ - "from langchain_community.vectorstores import CrateDBVectorSearch\n", "from langchain_core.documents import 
Document\n", + "from langchain_cratedb.vectorstores import CrateDBVectorStore\n", "from langchain_openai import OpenAIEmbeddings" ] }, @@ -223,11 +223,11 @@ "source": [ "embeddings = OpenAIEmbeddings()\n", "\n", - "store = CrateDBVectorSearch.from_documents(\n", + "store = CrateDBVectorStore.from_documents(\n", " embedding=embeddings,\n", " documents=docs,\n", " collection_name=COLLECTION_NAME,\n", - " connection_string=CONNECTION_STRING,\n", + " connection=CONNECTION_STRING,\n", ")" ] }, @@ -334,10 +334,10 @@ }, "outputs": [], "source": [ - "store = CrateDBVectorSearch(\n", + "store = CrateDBVectorStore(\n", " collection_name=COLLECTION_NAME,\n", - " connection_string=CONNECTION_STRING,\n", - " embedding_function=embeddings,\n", + " connection=CONNECTION_STRING,\n", + " embeddings=embeddings,\n", ")" ] }, @@ -426,11 +426,11 @@ }, "outputs": [], "source": [ - "store = CrateDBVectorSearch.from_documents(\n", + "store = CrateDBVectorStore.from_documents(\n", " documents=docs,\n", " embedding=embeddings,\n", " collection_name=COLLECTION_NAME,\n", - " connection_string=CONNECTION_STRING,\n", + " connection=CONNECTION_STRING,\n", " pre_delete_collection=True,\n", ")" ] diff --git a/topic/machine-learning/llm-langchain/vector_search.py b/topic/machine-learning/llm-langchain/vector_search.py index 6672e242..fcc9bb41 100644 --- a/topic/machine-learning/llm-langchain/vector_search.py +++ b/topic/machine-learning/llm-langchain/vector_search.py @@ -20,7 +20,7 @@ python vector_search.py """ # noqa: E501 -from langchain_community.vectorstores import CrateDBVectorSearch +from langchain_cratedb.vectorstores import CrateDBVectorStore from langchain_openai import OpenAIEmbeddings import nltk @@ -37,7 +37,7 @@ def main(): documents = CachedWebResource(url).langchain_documents(chunk_size=1000, chunk_overlap=0) # Embed each chunk, and load them into the vector store. 
- db = CrateDBVectorSearch.from_documents(documents, OpenAIEmbeddings()) + db = CrateDBVectorStore.from_documents(documents, OpenAIEmbeddings(), connection="crate://") # Invoke a query, and display the first result. query = "What did the president say about Ketanji Brown Jackson"