-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocessor.py
74 lines (62 loc) · 2.55 KB
/
preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import re
from textwrap import dedent
from bs4 import BeautifulSoup
from langchain_community.document_loaders import PDFMinerPDFasHTMLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
from config import llm, embeddings_model
class PreProcess:
    """Preprocess PDF documents into text chunks and index them in a FAISS store."""

    def __init__(self):
        # LLM and embedding model are app-wide singletons configured in config.py.
        self.llm = llm
        self.embeddings_model = embeddings_model

    # Preprocess the document and store embeddings
    def store_embeddings(self, file_path):
        """Parse a PDF into font-size-delimited text snippets, split them into
        overlapping chunks, and embed the chunks into a FAISS vector store.

        Args:
            file_path: Path to the PDF file to preprocess.

        Returns:
            A FAISS vector store holding the embedded document chunks.

        Raises:
            Exception: Any error from loading, parsing, or indexing is logged
                and re-raised to the caller.
        """
        try:
            loader = PDFMinerPDFasHTMLLoader(file_path)
            data = loader.load()[0]
            soup = BeautifulSoup(data.page_content, "html.parser")
            content = soup.find_all("div")
            cur_fs = None
            cur_text = ""
            snippets = []  # consecutive runs of text sharing the same font size
            # Raw string (the old non-raw "\d" is a SyntaxWarning on 3.12+),
            # compiled once instead of re-parsed for every <div>.
            font_size_re = re.compile(r"font-size:(\d+)px")
            for c in content:
                sp = c.find("span")
                if not sp:
                    continue
                st = sp.get("style")
                if not st:
                    continue
                m = font_size_re.search(st)
                if not m:
                    continue
                fs = int(m.group(1))
                # Explicit None check: a (hypothetical) 0px font size must not
                # re-trigger initialization on every iteration.
                if cur_fs is None:
                    cur_fs = fs
                if fs == cur_fs:
                    cur_text += c.text
                else:
                    snippets.append(cur_text)
                    cur_fs = fs
                    cur_text = c.text
            # Flush the trailing run; skip it when the document yielded no
            # usable text so we never embed an empty snippet.
            if cur_text:
                snippets.append(cur_text)
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000, chunk_overlap=200, add_start_index=True
            )
            all_splits = text_splitter.create_documents(snippets)
            vectorstore = FAISS.from_documents(all_splits, self.embeddings_model)
            logger.info("Document embeddings stored successfully.")
            # Use the module logger (lazy %-args) rather than a bare print.
            logger.info("vectorstore.index.ntotal: %s", vectorstore.index.ntotal)
            return vectorstore
        except Exception as e:
            logger.error(f"Error preprocessing document: {e}")
            raise

    # Query the vector store for relevant chunks
    def query_vector_store(self, query, vector_store, top_k=5):
        """Return the top_k document chunks most similar to *query*.

        Args:
            query: Natural-language query string.
            vector_store: FAISS store produced by store_embeddings().
            top_k: Number of nearest chunks to return (default 5).

        Returns:
            A list of the most similar Document chunks.
        """
        embedding = self.embeddings_model.embed_query(query)
        return vector_store.similarity_search_by_vector(embedding, top_k)