-
Notifications
You must be signed in to change notification settings - Fork 74
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* hamilton + Burr example added * specified beautifulsoup parser * added notebook badges; added files required by CI * added intro and conclusion --------- Co-authored-by: zilto <tjean@DESKTOP-V6JDCS2>
- Loading branch information
Showing
12 changed files
with
1,792 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# Modular RAG with Burr and Hamilton | ||
|
||
This example shows the "2-layer" approach to building RAG and LLM agents using Burr and Hamilton.
|
||
You will find: | ||
|
||
- `notebook.ipynb` contains a guide on how to build a modular RAG application. It details how a typical project evolves and how Burr and Hamilton can help you achieve the desired modularity.
- `application.py` and `actions/` contain the code from the final application version showed in the notebook. |
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
import lancedb | ||
import openai | ||
|
||
|
||
def relevant_chunks(user_query: str) -> list[dict]:
    """Retrieve the 3 chunks most relevant to the query from LanceDB.

    Opens the ``chunks`` table in the local ``./blogs`` database and runs a
    similarity search, returning ``text``, ``url``, and ``position`` fields.
    """
    table = lancedb.connect("./blogs").open_table("chunks")
    query = table.search(user_query)
    query = query.select(["text", "url", "position"])
    return query.limit(3).to_list()
|
||
|
||
def system_prompt(relevant_chunks: list[dict]) -> str:
    """Build the LLM system prompt from the retrieved chunks.

    Concatenates each chunk's ``text`` field (newline-separated) under a
    fixed instruction preamble.
    """
    context = "\n".join(chunk["text"] for chunk in relevant_chunks)
    instructions = (
        "Answer the user's questions based on the provided blog post content. "
        "Answer in a concise and helpful manner, and tell the user "
        "if you don't know the answer or you're unsure.\n\n"
    )
    return instructions + f"BLOG CONTENT:\n{context}"
|
||
|
||
def llm_answer(system_prompt: str, user_query: str) -> str:
    """Ask ``gpt-4o-mini`` to answer the query given the system prompt.

    Creates a fresh OpenAI client per call and returns the first
    completion's message content.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_query},
    ]
    client = openai.OpenAI()
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
    )
    return completion.choices[0].message.content
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
import re | ||
|
||
import lancedb | ||
import requests | ||
from bs4 import BeautifulSoup | ||
from lancedb.embeddings import get_registry | ||
from lancedb.pydantic import LanceModel, Vector | ||
|
||
# OpenAI embedding function from LanceDB's registry; provides the SourceField /
# VectorField pair used by the TextDocument schema below.
embedding_model = get_registry().get("openai").create()
|
||
|
||
class TextDocument(LanceModel):
    """Simple data structure to hold a piece of text associated with a url."""

    # URL of the source blog post
    url: str
    # index of this chunk within the post
    position: int
    # raw chunk text; SourceField marks it as the input to the embedding model
    text: str = embedding_model.SourceField()
    # embedding vector, auto-populated by LanceDB from `text` on insert
    vector: Vector(dim=embedding_model.ndims()) = embedding_model.VectorField()
|
||
|
||
def html_content(blog_post_url: str) -> str:
    """Download and return the raw HTML of the blog post."""
    response = requests.get(blog_post_url)
    return response.text
|
||
|
||
def parsed_text(html_content: str) -> str:
    """Strip HTML markup and return the visible text, space-separated."""
    soup = BeautifulSoup(html_content, "html.parser")
    text = soup.get_text(separator=" ", strip=True)
    return text
|
||
|
||
def sentences(parsed_text: str) -> list[str]:
    """Split text into sentences on runs of ``.``, ``!``, ``?``.

    Each piece is whitespace-stripped; empty pieces are dropped.
    """
    pieces = re.split(r"[.!?]+", parsed_text)
    result = []
    for piece in pieces:
        stripped = piece.strip()
        if stripped:
            result.append(stripped)
    return result
|
||
|
||
def overlapping_chunks(
    sentences: list[str], window: int = 5, stride: int = 3, min_window_size: int = 2
) -> list[str]:
    """Group sentences into overlapping, space-joined windows.

    A window of up to ``window`` sentences starts every ``stride`` sentences.
    A trailing partial window is kept only if at least ``min_window_size``
    sentences remain.
    """
    n_sentences = len(sentences)  # renamed from the misleading `n_chunks`
    chunks = []
    for start in range(0, n_sentences, stride):
        end = min(start + window, n_sentences)
        full_window_fits = start + window <= n_sentences
        tail_big_enough = n_sentences - start >= min_window_size
        if full_window_fits or tail_big_enough:
            chunks.append(" ".join(sentences[start:end]))
    return chunks
|
||
|
||
def embed_chunks(overlapping_chunks: list[str], blog_post_url: str) -> dict:
    """Store the chunks in the LanceDB ``chunks`` table.

    Embeddings are computed by LanceDB itself via the TextDocument schema.
    Returns a summary dict with the number of chunks written.
    """
    records = []
    for position, chunk in enumerate(overlapping_chunks):
        records.append({"text": chunk, "url": blog_post_url, "position": position})
    db = lancedb.connect("./blogs")
    table = db.create_table("chunks", exist_ok=True, schema=TextDocument)
    table.add(records)
    return {"n_chunks_embedded": len(records)}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
from hamilton.driver import Builder, Driver | ||
|
||
from burr.core import ApplicationBuilder, State, action | ||
|
||
|
||
@action(reads=[], writes=[])
def ingest_blog(state: State, blog_post_url: str, dr: Driver) -> State:
    """Download a blog post, parse it, and embed its chunks.

    Runs the Hamilton dataflow up to ``embed_chunks``; the result is
    persisted in LanceDB as a side effect, so no Burr state is written
    and the incoming state is returned unchanged.
    """
    dr.execute(["embed_chunks"], inputs={"blog_post_url": blog_post_url})
    return state
|
||
|
||
@action(reads=[], writes=["llm_answer"])
def ask_question(state: State, user_query: str, dr: Driver) -> State:
    """Reply to the user's query using the blog's content.

    Runs the Hamilton dataflow up to ``llm_answer`` and writes the
    generated answer into the Burr state under ``llm_answer``.
    """
    answer = dr.execute(["llm_answer"], inputs={"user_query": user_query})["llm_answer"]
    return state.update(llm_answer=answer)
|
||
|
||
if __name__ == "__main__":
    # renames to avoid name conflicts with the @action functions
    from actions import ask_question as ask_module
    from actions import ingest_blog as ingest_module
    from hamilton.plugins.h_opentelemetry import OpenTelemetryTracer
    from opentelemetry.instrumentation.lancedb import LanceInstrumentor
    from opentelemetry.instrumentation.openai import OpenAIInstrumentor

    # Instrument OpenAI and LanceDB clients up front so all later calls
    # (from both the driver and the app) emit OpenTelemetry spans.
    OpenAIInstrumentor().instrument()
    LanceInstrumentor().instrument()

    # Hamilton driver over both dataflow modules; the OpenTelemetry tracer
    # adapter reports node-level execution telemetry.
    dr = (
        Builder()
        .with_modules(ingest_module, ask_module)
        .with_adapters(OpenTelemetryTracer())
        .build()
    )

    # Burr application: ingest_blog -> ask_question. The shared Hamilton
    # driver is bound into each action; runs are tracked under the
    # "modular-rag" project with OTel tracing enabled.
    app = (
        ApplicationBuilder()
        .with_actions(ingest_blog.bind(dr=dr), ask_question.bind(dr=dr))
        .with_transitions(("ingest_blog", "ask_question"))
        .with_entrypoint("ingest_blog")
        .with_tracker(project="modular-rag", use_otel_tracing=True)
        .build()
    )

    # Run the app until ask_question completes, then print the answer
    # stored in state by the ask_question action.
    action_name, results, state = app.run(
        halt_after=["ask_question"],
        inputs={
            "blog_post_url": "https://blog.dagworks.io/p/from-blog-to-bot-build-a-rag-app",
            "user_query": "What do you need to monitor in a RAG app?",
        },
    )
    print(state["llm_answer"])
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Oops, something went wrong.