embedding.py
import glob
import os
import pickle

from dotenv import load_dotenv
from tqdm import tqdm

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter

load_dotenv(verbose=True)
document_dir = os.environ['DOCUMENT_DIR']
vectorstore_dir = os.environ['VECTOR_DB_DIR']
embeddings_model = os.environ['MODEL_EMBEDDINGS']
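# Example .env entries this script expects (the values below are illustrative
# assumptions, not values taken from the repository):
#   DOCUMENT_DIR=./data
#   VECTOR_DB_DIR=./chroma_db
#   MODEL_EMBEDDINGS=BAAI/bge-m3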
### WORKAROUND for "trust_remote_code=True is required" error in HuggingFaceEmbeddings()
from transformers import AutoModel
model = AutoModel.from_pretrained(embeddings_model, trust_remote_code=True)
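# A possible alternative (version-dependent, so treat it as an assumption): recent
# sentence-transformers releases accept trust_remote_code directly, in which case
# the explicit AutoModel pre-load above could be replaced with
#   HuggingFaceEmbeddings(model_name=embeddings_model,
#                         model_kwargs={'device': 'cpu', 'trust_remote_code': True})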
##
## Read the source documents, split them into chunks, and return the resulting
## list of Document objects.
##
def generate_vectorstore_docs_from_data_source(
        glob_pattern: str,
        chunk_size: int = 32,
        chunk_overlap: int = 0,
        max_doc_count: int = -1):
    data_files = glob.glob(glob_pattern, recursive=True)
    documents = []
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
        separators=[
            "\n\n",
            "\n",
            " ",
            "\uff0e",  # Fullwidth full stop
            "\u3000",  # Ideographic space
            "\u3002",  # Ideographic full stop
            "",
        ],
    )
    print('*** Begin splitting documents')
    for data_file in tqdm(data_files):
        with open(data_file, 'rt', encoding='utf-8') as f:
            text = f.read()
        documents.extend(text_splitter.create_documents([text]))
        # Optional cap on the number of chunks; max_doc_count < 0 means no limit.
        if 0 <= max_doc_count <= len(documents):
            documents = documents[:max_doc_count]
            break
    print('*** Finished splitting documents')
    return documents

##
## Embed the documents and add them to the persistent Chroma vector store.
##
def add_documents_to_vectorstore(documents):
    hf_bge_embeddings = HuggingFaceEmbeddings(
        model_name=embeddings_model,
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True},
    )
    vectorstore = Chroma(
        persist_directory=vectorstore_dir,
        embedding_function=hf_bge_embeddings,
    )
    print('*** Begin adding documents to the vector DB')
    for doc in tqdm(documents):
        vectorstore.add_documents([doc])
    print('*** Finished adding documents to the vector DB')

# Generate document chunks from the data source, or read them from the pickle file
# if it exists, then embed them and add them to the vector store.
pickle_file = './doc_obj.pickle'
if not os.path.exists(pickle_file):
    print('*** Reading the original data and splitting it into document chunks')
    docs = generate_vectorstore_docs_from_data_source(f'{document_dir}/**/*.txt')
    with open(pickle_file, 'wb') as f:
        pickle.dump(docs, file=f)
else:
    print(f'*** Reading the pre-split document chunks from a pickled file ({pickle_file})')
    with open(pickle_file, 'rb') as f:
        docs = pickle.load(file=f)

print('*** Converting the documents into embeddings and adding them to the vector store')
add_documents_to_vectorstore(docs)
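
# Usage sketch: querying the persisted store afterwards. A minimal example only,
# assuming the same .env values are available; the query text and k are
# illustrative assumptions, not part of this script's workflow.
#
# query_embeddings = HuggingFaceEmbeddings(
#     model_name=embeddings_model,
#     model_kwargs={'device': 'cpu'},
#     encode_kwargs={'normalize_embeddings': True},
# )
# store = Chroma(persist_directory=vectorstore_dir, embedding_function=query_embeddings)
# for hit in store.similarity_search('What does the document say about X?', k=3):
#     print(hit.page_content[:200])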