-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathutils.py
39 lines (32 loc) · 1.69 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import os
from glob import glob

import yaml
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from tqdm import tqdm
def load_config(path: str = "config.yaml") -> dict:
    """Read the YAML configuration file and return its parsed content.

    Args:
        path: Location of the YAML file. Defaults to ``config.yaml``, resolved
            against the current working directory.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file; the rest of this
        module indexes it like a nested mapping.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # An explicit encoding keeps parsing independent of the platform locale.
    with open(path, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)
# Parsed once when this module is imported; the function signatures below read
# their default argument values from this object at definition time.
config = load_config()
def load_embeddings(model_name=config["embeddings"]["name"], model_kwargs=None):
    """Build the HuggingFace embedding model used by this module.

    Args:
        model_name: Model identifier handed to ``HuggingFaceEmbeddings``.
            Defaults to ``config["embeddings"]["name"]``.
        model_kwargs: Keyword arguments forwarded to ``HuggingFaceEmbeddings``
            as ``model_kwargs``. When omitted (``None``), a fresh
            ``{"device": config["embeddings"]["device"]}`` dict is used.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    # Create the default dict per call: a dict literal in the signature is
    # built once and shared, so a mutation by any consumer would leak into
    # every later call.
    if model_kwargs is None:
        model_kwargs = {"device": config["embeddings"]["device"]}
    return HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)
def load_documents(directory: str):
    """Load every PDF directly inside ``directory`` and split it into chunks.

    Chunk size and overlap are read from the ``TextSplitter`` section of the
    module-level ``config``.

    Args:
        directory: Folder containing the PDF files. A trailing slash is
            accepted but no longer required.

    Returns:
        A flat list holding the chunks returned by
        ``PyPDFLoader.load_and_split`` for all matched files, in sorted path
        order. Empty when no ``*.pdf`` file matches.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=config["TextSplitter"]["chunk_size"],
        chunk_overlap=config["TextSplitter"]["chunk_overlap"],
    )
    # os.path.join adds the separator only when it is missing, so "docs" and
    # "docs/" both yield "docs/*.pdf". Plain string concatenation turned
    # "docs" into the pattern "docs*.pdf", which searches the parent folder.
    # Sorting makes the chunk order reproducible; glob gives no order guarantee.
    pdf_paths = sorted(glob(os.path.join(directory, "*.pdf")))
    documents = []
    for item_path in tqdm(pdf_paths):
        loader = PyPDFLoader(item_path)
        documents.extend(loader.load_and_split(text_splitter=text_splitter))
    return documents
def load_db(
    embedding_function,
    save_path=config["faiss_indexstore"]["save_path"],
    index_name=config["faiss_indexstore"]["index_name"],
):
    """Load a FAISS vector store that was previously written to disk.

    Args:
        embedding_function: Embeddings object passed to ``FAISS.load_local``
            as ``embeddings``.
        save_path: Folder holding the stored index. Defaults to
            ``config["faiss_indexstore"]["save_path"]``.
        index_name: Name of the index inside ``save_path``. Defaults to
            ``config["faiss_indexstore"]["index_name"]``.

    Returns:
        The object returned by ``FAISS.load_local``.
    """
    # NOTE(review): FAISS.load_local is believed to unpickle part of the store;
    # confirm against the installed langchain version and only load trusted
    # index folders.
    return FAISS.load_local(
        folder_path=save_path,
        index_name=index_name,
        embeddings=embedding_function,
    )
def save_db(
    db,
    save_path=config["faiss_indexstore"]["save_path"],
    index_name=config["faiss_indexstore"]["index_name"],
):
    """Persist a vector store to disk and print where it was written.

    Args:
        db: Object exposing ``save_local(folder, index_name)``.
        save_path: Target folder. Defaults to
            ``config["faiss_indexstore"]["save_path"]``.
        index_name: Name of the index inside ``save_path``. Defaults to
            ``config["faiss_indexstore"]["index_name"]``.
    """
    db.save_local(save_path, index_name)
    # Join with a real path separator: bare concatenation printed a wrong
    # location when save_path had no trailing slash and raised TypeError
    # (after the save had already succeeded) for path-like save_path values.
    print("Saved db to " + os.path.join(save_path, index_name))