Skip to content

Commit

Permalink
ML/LangChain: Fix notebooks by following upstream changes
Browse files (browse the repository at this point in the history)
  • Loading branch information
amotl committed Nov 3, 2024
1 parent 9801b61 commit 0fa4c55
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 12 deletions.
4 changes: 2 additions & 2 deletions topic/machine-learning/llm-langchain/document_loader.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -159,8 +159,8 @@
"loader = CrateDBLoader(\n",
" 'SELECT * FROM mlb_teams_2012 ORDER BY \"Team\" LIMIT 5;',\n",
" db=db,\n",
" page_content_columns=[\"Team\"],\n",
" metadata_columns=[\"Payroll (millions)\"],\n",
" page_content_mapper=lambda row: row[\"Team\"],\n",
" metadata_mapper=lambda row: {\"Payroll (millions)\": row[\"Payroll (millions)\"]},\n",
")\n",
"documents = loader.load()"
]
Expand Down
2 changes: 1 addition & 1 deletion topic/machine-learning/llm-langchain/requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Real.
cratedb-toolkit[io]
pueblo[notebook,testing]==0.0.9
pueblo[notebook,testing]

# Development.
# cratedb-toolkit[io] @ git+https://github.com/crate-workbench/cratedb-toolkit.git@main
Expand Down
2 changes: 1 addition & 1 deletion topic/machine-learning/llm-langchain/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ google-cloud-aiplatform<2
langchain-google-vertexai<3
langchain-openai<0.3
langchain-text-splitters<0.4
pueblo[cli,nlp]==0.0.9
pueblo[cli,nlp] @ git+https://github.com/pyveci/pueblo.git@main
pydantic>=2,<3
pypdf<6
python-dotenv<2
Expand Down
15 changes: 7 additions & 8 deletions topic/machine-learning/llm-langchain/vector_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,25 +19,24 @@
# Run program.
python vector_search.py
""" # noqa: E501
from langchain_community.document_loaders import UnstructuredURLLoader

from langchain_community.vectorstores import CrateDBVectorSearch
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

import nltk
from pueblo.nlp.resource import CachedWebResource


def main():

nltk.download("averaged_perceptron_tagger_eng")
nltk.download("punkt_tab")

# Load the document, split it into chunks, embed each chunk,
# and load it into the vector store.
state_of_the_union_url = "https://github.com/langchain-ai/langchain/raw/v0.0.325/docs/docs/modules/state_of_the_union.txt"
raw_documents = UnstructuredURLLoader(urls=[state_of_the_union_url]).load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)
# Load a document, and split it into chunks.
url = "https://github.com/langchain-ai/langchain/raw/v0.0.325/docs/docs/modules/state_of_the_union.txt"
documents = CachedWebResource(url).langchain_documents(chunk_size=1000, chunk_overlap=0)

# Embed each chunk, and load them into the vector store.
db = CrateDBVectorSearch.from_documents(documents, OpenAIEmbeddings())

# Invoke a query, and display the first result.
Expand Down

0 comments on commit 0fa4c55

Please sign in to comment.