forked from kaarthik108/snowChat
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ingest.py
58 lines (46 loc) · 1.73 KB
/
ingest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import streamlit as st
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import SupabaseVectorStore
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import DirectoryLoader
from supabase.client import Client, create_client
from typing import Any, Dict
from pydantic import BaseModel
class Secrets(BaseModel):
    """Credentials needed by the ingestion pipeline.

    Every field is a required string. ``DocumentProcessor`` passes the two
    Supabase values to ``create_client`` and the OpenAI key to
    ``OpenAIEmbeddings``.
    """

    # First argument to ``create_client``.
    SUPABASE_URL: str
    # Second argument to ``create_client``.
    SUPABASE_SERVICE_KEY: str
    # Passed to ``OpenAIEmbeddings`` as ``openai_api_key``.
    OPENAI_API_KEY: str
class Config(BaseModel):
    """Tunable settings for loading and splitting the source documents.

    All fields have defaults, so ``Config()`` yields a usable configuration.
    """

    # Forwarded to ``CharacterTextSplitter(chunk_size=...)``.
    chunk_size: int = 1000
    # Forwarded to ``CharacterTextSplitter(chunk_overlap=...)``.
    chunk_overlap: int = 0
    # Directory handed to ``DirectoryLoader`` as the place to look for files.
    docs_dir: str = "docs/"
    # Glob pattern handed to ``DirectoryLoader`` to select files.
    docs_glob: str = "**/*.md"
class DocumentProcessor:
    """Load documents, split them into chunks, embed them and store them.

    The collaborators (Supabase client, directory loader, text splitter and
    embedding model) are created once in ``__init__``; ``process`` runs the
    load -> split -> embed/store pipeline with them.
    """

    def __init__(self, secrets: Secrets, config: Config):
        """Build the pipeline's collaborators.

        Args:
            secrets: Supabase URL/service key and OpenAI API key.
            config: Document location, glob pattern and splitter settings.
        """
        self.client: Client = create_client(
            secrets.SUPABASE_URL, secrets.SUPABASE_SERVICE_KEY
        )
        self.loader = DirectoryLoader(config.docs_dir, glob=config.docs_glob)
        self.text_splitter = CharacterTextSplitter(
            chunk_size=config.chunk_size, chunk_overlap=config.chunk_overlap
        )
        self.embeddings = OpenAIEmbeddings(openai_api_key=secrets.OPENAI_API_KEY)

    def process(self) -> SupabaseVectorStore:
        """Run the ingestion pipeline and return the resulting vector store.

        Returns:
            The ``SupabaseVectorStore`` created by
            ``SupabaseVectorStore.from_documents`` from the split documents.
        """
        documents = self.loader.load()
        chunks = self.text_splitter.split_documents(documents)
        # The return value is the vector store itself, not a dict; the
        # annotation above reflects that.
        return SupabaseVectorStore.from_documents(
            chunks, self.embeddings, client=self.client
        )
def run():
    """Ingest the documents using credentials from Streamlit secrets.

    Reads the three credential entries from ``st.secrets``, builds a
    ``DocumentProcessor`` with the default ``Config`` and returns whatever
    its ``process`` method returns.
    """
    secret_names = ("SUPABASE_URL", "SUPABASE_SERVICE_KEY", "OPENAI_API_KEY")
    secrets = Secrets(**{name: st.secrets[name] for name in secret_names})
    return DocumentProcessor(secrets, Config()).process()
# Script entry point: run the ingestion only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    run()