diff --git a/topic/machine-learning/llm-langchain/document_loader.ipynb b/topic/machine-learning/llm-langchain/document_loader.ipynb index 3067a7b0..bab75e51 100644 --- a/topic/machine-learning/llm-langchain/document_loader.ipynb +++ b/topic/machine-learning/llm-langchain/document_loader.ipynb @@ -159,8 +159,8 @@ "loader = CrateDBLoader(\n", " 'SELECT * FROM mlb_teams_2012 ORDER BY \"Team\" LIMIT 5;',\n", " db=db,\n", - " page_content_columns=[\"Team\"],\n", - " metadata_columns=[\"Payroll (millions)\"],\n", + " page_content_mapper=lambda row: row[\"Team\"],\n", + " metadata_mapper=lambda row: {\"Payroll (millions)\": row[\"Payroll (millions)\"]},\n", ")\n", "documents = loader.load()" ] diff --git a/topic/machine-learning/llm-langchain/requirements-dev.txt b/topic/machine-learning/llm-langchain/requirements-dev.txt index a6b83092..7220a15c 100644 --- a/topic/machine-learning/llm-langchain/requirements-dev.txt +++ b/topic/machine-learning/llm-langchain/requirements-dev.txt @@ -1,6 +1,6 @@ # Real. cratedb-toolkit[io] -pueblo[notebook,testing]==0.0.9 +pueblo[notebook,testing] # Development. # cratedb-toolkit[io] @ git+https://github.com/crate-workbench/cratedb-toolkit.git@main diff --git a/topic/machine-learning/llm-langchain/requirements.txt b/topic/machine-learning/llm-langchain/requirements.txt index 5da76636..2725deb1 100644 --- a/topic/machine-learning/llm-langchain/requirements.txt +++ b/topic/machine-learning/llm-langchain/requirements.txt @@ -4,7 +4,7 @@ google-cloud-aiplatform<2 langchain-google-vertexai<3 langchain-openai<0.3 langchain-text-splitters<0.4 -pueblo[cli,nlp]==0.0.9 +pueblo[cli,nlp] @ git+https://github.com/pyveci/pueblo.git@main pydantic>=2,<3 pypdf<6 python-dotenv<2 diff --git a/topic/machine-learning/llm-langchain/vector_search.py b/topic/machine-learning/llm-langchain/vector_search.py index 8a37c015..6672e242 100644 --- a/topic/machine-learning/llm-langchain/vector_search.py +++ b/topic/machine-learning/llm-langchain/vector_search.py @@ -19,12 +19,12 @@ # Run program. python vector_search.py """ # noqa: E501 -from langchain_community.document_loaders import UnstructuredURLLoader + from langchain_community.vectorstores import CrateDBVectorSearch -from langchain_text_splitters import CharacterTextSplitter from langchain_openai import OpenAIEmbeddings import nltk +from pueblo.nlp.resource import CachedWebResource def main(): @@ -32,12 +32,11 @@ def main(): nltk.download("averaged_perceptron_tagger_eng") nltk.download("punkt_tab") - # Load the document, split it into chunks, embed each chunk, - # and load it into the vector store. - state_of_the_union_url = "https://github.com/langchain-ai/langchain/raw/v0.0.325/docs/docs/modules/state_of_the_union.txt" - raw_documents = UnstructuredURLLoader(urls=[state_of_the_union_url]).load() - text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) - documents = text_splitter.split_documents(raw_documents) + # Load a document, and split it into chunks. + url = "https://github.com/langchain-ai/langchain/raw/v0.0.325/docs/docs/modules/state_of_the_union.txt" + documents = CachedWebResource(url).langchain_documents(chunk_size=1000, chunk_overlap=0) + + # Embed each chunk, and load them into the vector store. db = CrateDBVectorSearch.from_documents(documents, OpenAIEmbeddings()) # Invoke a query, and display the first result.