From ca67687ce2645d052e63e1bbdb8fa1cddbcb4e1b Mon Sep 17 00:00:00 2001 From: Juan Perez Tejada Date: Fri, 8 Dec 2023 20:37:56 -0600 Subject: [PATCH 1/6] Create embeddings with OpenAI --- app.py | 65 +++++++++++++++++++++++++++++++++------- requirements.txt | 2 +- wk_flow_requirements.txt | 2 +- 3 files changed, 56 insertions(+), 13 deletions(-) diff --git a/app.py b/app.py index 0e8a5c9..cce7025 100644 --- a/app.py +++ b/app.py @@ -1,15 +1,36 @@ """ A simple example of Streamlit. """ from datetime import datetime as Date +import textwrap +import tiktoken import chromadb +from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction import fitz import streamlit as st +import openai +import os +from dotenv import load_dotenv +from openai import OpenAI +load_dotenv() + +if os.getenv("OPENAI_API_KEY") is None: + st.error("Please set OPENAI_API_KEY environment variable") + st.stop() +else: + openai.api_key = os.getenv("OPENAI_API_KEY") + +client = OpenAI() +embedding_function = OpenAIEmbeddingFunction( + api_key=openai.api_key, model_name="text-embedding-ada-002" +) # from openai import OpenAI chroma_client = chromadb.PersistentClient(path="tmp/chroma") chroma_client.heartbeat() -collection = chroma_client.get_or_create_collection("pdf-explainer") +collection = chroma_client.get_or_create_collection( + name="pdf-explainer", embedding_function=embedding_function +) # Query ChromaDb query = st.text_input("Query ChromaDb", value="", placeholder="Enter query") @@ -25,29 +46,51 @@ + "..." + "**Source:** " + results["metadatas"][0][idx]["source"] + + " **Tokens:** " + + str(results["metadatas"][0][idx]["num_tokens"]) ) pdf = st.file_uploader("Upload a file", type="pdf") - -if st.button("Save"): - if pdf is not None: - with fitz.open(stream=pdf.read(), filetype="pdf") as doc: # open document - text = chr(12).join([page.get_text() for page in doc]) - st.write(text[0:200]) +if pdf is not None: + with fitz.open(stream=pdf.read(), filetype="pdf") as doc: # open document + text = chr(12).join([page.get_text() for page in doc]) + st.write(text[0:200]) + if st.button("Add to collection"): collection.add( documents=[text], metadatas=[{"source": pdf.name}], ids=[pdf.name + str(Date.now())], ) - else: - st.write("Please upload a file of type: pdf") + if st.button("Save chunks"): + with st.spinner("Saving chunks..."): + chunks = textwrap.wrap(text, 24000) + for idx, chunk in enumerate(chunks): + encoding = tiktoken.get_encoding("cl100k_base") + num_tokens = len(encoding.encode(chunk)) + response = ( + client.embeddings.create( + input=chunk, model="text-embedding-ada-002" + ) + .data[0] + .embedding + ) + collection.add( + embeddings=[response], + documents=[chunk], + metadatas=[{"source": pdf.name, "num_tokens": num_tokens}], + ids=[pdf.name + str(idx)], + ) +else: + st.write("Please upload a file of type: pdf") if st.button("Chroma data collection"): st.write(collection) if st.button("Delete Chroma Collection"): - chroma_client.delete_collection(collection.name) - st.write("Deleted Chroma Collection") + try: + chroma_client.delete_collection(collection.name) + except AttributeError: + st.error("Collection erased.") diff --git a/requirements.txt b/requirements.txt index bcbff57..aae53b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,6 @@ tiktoken langchain pymupdf pypdf -chromadb +chromadb>='0.4.18' sentence_transformers streamlit \ No newline at end of file diff --git a/wk_flow_requirements.txt b/wk_flow_requirements.txt index 2389c28..be29b87 100644 --- a/wk_flow_requirements.txt +++ b/wk_flow_requirements.txt @@ -1,4 +1,4 @@ streamlit pymupdf pylint -chromadb \ No newline at end of file +chromadb>='0.4.18' From 04e2ec81d86c0398c001f7bb7d530a0e8bbda49e Mon Sep 17 00:00:00 2001 From: Juan Perez Tejada Date: Fri, 8 Dec 2023 20:57:29 -0600 Subject: [PATCH 2/6] Solve pylint complains --- app.py | 12 +++++++++++- wk_flow_requirements.txt | 2 ++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/app.py b/app.py index cce7025..36775d0 100644 --- a/app.py +++ b/app.py @@ -1,15 +1,16 @@ """ A simple example of Streamlit. """ from datetime import datetime as Date import textwrap +import os import tiktoken import chromadb from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction import fitz import streamlit as st import openai -import os from dotenv import load_dotenv from openai import OpenAI +from langchain.vectorstores import Chroma load_dotenv() @@ -94,3 +95,12 @@ chroma_client.delete_collection(collection.name) except AttributeError: st.error("Collection erased.") + +if chroma_client.get_collection(collection.name) is not None: + langchain_agent = Chroma(client=chroma_client, + collection_name=collection.name, + embedding_function=embedding_function + ) + + + diff --git a/wk_flow_requirements.txt b/wk_flow_requirements.txt index be29b87..fd84690 100644 --- a/wk_flow_requirements.txt +++ b/wk_flow_requirements.txt @@ -1,4 +1,6 @@ streamlit pymupdf +openai +tiktoken pylint chromadb>='0.4.18' From 9ae39bd22530de1d5e920df649b466595be1d353 Mon Sep 17 00:00:00 2001 From: Juan Perez Tejada Date: Fri, 8 Dec 2023 21:13:12 -0600 Subject: [PATCH 3/6] Solve pylint complains --- app.py | 7 ------- wk_flow_requirements.txt | 3 ++- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/app.py b/app.py index 36775d0..16bc39b 100644 --- a/app.py +++ b/app.py @@ -10,7 +10,6 @@ import openai from dotenv import load_dotenv from openai import OpenAI -from langchain.vectorstores import Chroma load_dotenv() @@ -95,12 +94,6 @@ chroma_client.delete_collection(collection.name) except AttributeError: st.error("Collection erased.") - -if chroma_client.get_collection(collection.name) is not None: - langchain_agent = Chroma(client=chroma_client, - collection_name=collection.name, - embedding_function=embedding_function - ) diff --git a/wk_flow_requirements.txt b/wk_flow_requirements.txt index fd84690..b54a825 100644 --- a/wk_flow_requirements.txt +++ b/wk_flow_requirements.txt @@ -2,5 +2,6 @@ streamlit pymupdf openai tiktoken -pylint +pylint} +langchain chromadb>='0.4.18' From e308ccaf90fae30821a42cae77944b1e709d5a4a Mon Sep 17 00:00:00 2001 From: Juan Perez Tejada Date: Fri, 8 Dec 2023 21:14:32 -0600 Subject: [PATCH 4/6] Solve pylint complains --- wk_flow_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wk_flow_requirements.txt b/wk_flow_requirements.txt index b54a825..8633cae 100644 --- a/wk_flow_requirements.txt +++ b/wk_flow_requirements.txt @@ -2,6 +2,6 @@ streamlit pymupdf openai tiktoken -pylint} +pylint langchain chromadb>='0.4.18' From bdfb18a2cb440e31a5a2628c482be2d8f06313bb Mon Sep 17 00:00:00 2001 From: Juan Perez Tejada Date: Fri, 8 Dec 2023 21:30:16 -0600 Subject: [PATCH 5/6] Solve pylint complains --- app.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/app.py b/app.py index 16bc39b..0170fff 100644 --- a/app.py +++ b/app.py @@ -93,7 +93,4 @@ try: chroma_client.delete_collection(collection.name) except AttributeError: - st.error("Collection erased.") - - - + st.error("Collection erased.") \ No newline at end of file From 53b330a90a963c7ff087efb5f2cd91739ac2ce90 Mon Sep 17 00:00:00 2001 From: Juan Perez Tejada Date: Fri, 8 Dec 2023 21:32:26 -0600 Subject: [PATCH 6/6] Solve pylint complains --- app.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app.py b/app.py index 0170fff..3acaf5d 100644 --- a/app.py +++ b/app.py @@ -93,4 +93,5 @@ try: chroma_client.delete_collection(collection.name) except AttributeError: - st.error("Collection erased.") \ No newline at end of file + st.error("Collection erased.") + \ No newline at end of file