utils.py
import os

from astrapy.db import AstraDB
from dotenv import find_dotenv, load_dotenv
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI

# Load environment variables (.env values override any already set).
load_dotenv(find_dotenv(), override=True)


def init_astra_db():
    """Connect to Astra DB and return the client handle."""
    db = AstraDB(
        namespace="neji",
        token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
        api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
    )
    print(f"Connected to Astra DB: {db.get_collections()}")
    return db


def load_any_document(file):
    """Load a .pdf, .docx, or .txt file into LangChain documents."""
    _, extension = os.path.splitext(file)
    if extension == ".pdf":
        from langchain.document_loaders import PyPDFLoader
        print(f"Loading file {file}")
        loader = PyPDFLoader(file)
    elif extension == ".docx":
        from langchain.document_loaders import Docx2txtLoader
        print(f"Loading file {file}")
        loader = Docx2txtLoader(file)
    elif extension == ".txt":
        from langchain.document_loaders import TextLoader
        print(f"Loading file {file}")
        loader = TextLoader(file)
    else:
        print(f"Unsupported document format: {extension}")
        return None
    return loader.load()


def chunk_data(doc, chunk_size=256, chunk_overlap=10, metadata=None):
    """Split documents into overlapping chunks and attach shared metadata."""
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=["\n", " \n"]
    )
    chunks = text_splitter.split_documents(doc)
    # Replace each chunk's metadata with the caller-supplied dict.
    for chunk in chunks:
        chunk.metadata = metadata or {}
    return chunks


# Iterate over the files the user uploaded (Streamlit UploadedFile-style
# objects: they expose .read(), .name, .type, and .size).
def create_docs(user_pdf_list, unique_id):
    output_docs = []
    file_names = []
    os.makedirs("./uploads", exist_ok=True)  # ensure the upload directory exists
    for document in user_pdf_list:
        bytes_data = document.read()
        file_names.append(document.name)
        # Persist the upload to disk so the file-based loaders can read it.
        file_name = os.path.join("./uploads", document.name)
        with open(file_name, "wb") as f:
            f.write(bytes_data)
        docs = load_any_document(file_name)
        if docs is None:  # skip unsupported formats
            continue
        chunks = chunk_data(
            doc=docs,
            chunk_size=1000,
            chunk_overlap=200,
            metadata={
                "name": document.name,
                "type": document.type,
                "size": document.size,
                "unique_id": unique_id,
            },
        )
        output_docs.extend(chunks)
    return output_docs


def get_summary(current_doc):
    """Summarize a single document with a map-reduce summarization chain."""
    llm = ChatOpenAI()
    chain = load_summarize_chain(llm=llm, chain_type="map_reduce")
    summary = chain.run([current_doc])
    return summary
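

# --- Minimal usage sketch (illustrative, not part of the original module) ---
# Assumes a local "sample.pdf" (hypothetical filename) and that OPENAI_API_KEY,
# ASTRA_DB_APPLICATION_TOKEN, and ASTRA_DB_API_ENDPOINT are set in the .env file.
if __name__ == "__main__":
    init_astra_db()  # sanity-check the Astra DB connection
    data = load_any_document("sample.pdf")
    if data:
        chunks = chunk_data(data, metadata={"name": "sample.pdf"})
        print(f"Produced {len(chunks)} chunks")
        print(get_summary(data[0]))  # summarize the first loaded document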