import logging
logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)
import warnings
warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
import os

from haystack import Pipeline
from haystack.nodes import PreProcessor, TextConverter

# Alternative converter for PDF inputs:
# from haystack.nodes import PDFToTextConverter
# pdf_converter = PDFToTextConverter()

text_converter = TextConverter()
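# TextConverter reads plain-text files and wraps each one in a Haystack
# Document; the commented-out PDFToTextConverter above would do the same
# for PDF files.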
pre_processor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
)
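# Optional sanity check (hypothetical snippet; the sample text is made up):
# run the PreProcessor directly on a single Document and inspect the splits
# it produces with the settings above.
#
#   from haystack import Document
#   splits = pre_processor.process([Document(content="Some long text ...")])
#   print(len(splits), splits[0].content)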
# Alternative: in-memory document store with BM25 retrieval:
# from haystack.document_stores import InMemoryDocumentStore
# document_store = InMemoryDocumentStore(use_bm25=True)

# Using the FAISS document store:
from haystack.document_stores import FAISSDocumentStore
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")
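# "Flat" builds an exact (brute-force) FAISS index: every query embedding is
# compared against all document embeddings. Accurate, but lookup cost grows
# linearly with the corpus, so larger corpora may prefer an approximate index.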
# Alternative: create the retriever up front (see the commented-out pipeline
# node below):
# from haystack.nodes import EmbeddingRetriever
# preprocessing_retriever = EmbeddingRetriever(document_store=document_store, embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1", model_format="sentence_transformers", top_k=20)
print("started")
indexing_pipeline = Pipeline()
# indexing_pipeline.add_node(component=PDFToTextConverter, name="PDFToTextConverter", inputs=["File"])
indexing_pipeline.add_node(component=text_converter, name="TextConverter", inputs=["File"])
indexing_pipeline.add_node(component=pre_processor, name="PreProcessor", inputs=["TextConverter"])
# indexing_pipeline.add_node(component=document_store, name="InMemoryDocs", inputs=["TextConverter"])
# indexing_pipeline.add_node(component=preprocessing_retriever, name="Retriever_for_embeddings", inputs=["PreProcessor"])
indexing_pipeline.add_node(component=document_store, name="FIASS_Docstore", inputs=["TextConverter"])
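# Resulting graph: File -> TextConverter -> PreProcessor -> FAISSDocumentStore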
doc_dir = "scrap"
files_to_index = [os.path.join(doc_dir, f) for f in os.listdir(doc_dir)]
for file_path in files_to_index:
    print(file_path)
print("Indexing pipeline started")
indexing_pipeline.run(file_paths=files_to_index)
print("Indexing pipeline Successfully completed")
# Next, embed the indexed documents and save the FAISS store so we don't have
# to run the indexing pipeline again and again once we have the data.
from haystack.nodes import EmbeddingRetriever
retriever = EmbeddingRetriever(
    document_store=document_store, embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1"
)
# Important: now that the Retriever is initialized, call update_embeddings()
# to iterate over all previously indexed documents and update their embedding
# representations. This can be time-consuming (depending on corpus size), but
# it only needs to be done once; at query time we only embed the query and
# compare it to the existing document embeddings, which is very fast.
document_store.update_embeddings(retriever)
# Save the document store. This creates two files: data/my_index.faiss (the
# FAISS index itself) and data/my_config.json (the store's configuration).
document_store.save(index_path="data/my_index.faiss", config_path="data/my_config.json")
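
# On later runs, reload the saved store instead of re-indexing. A minimal
# sketch, assuming the paths used above (the query string is just an example):
#
#   from haystack.document_stores import FAISSDocumentStore
#   from haystack.nodes import EmbeddingRetriever
#
#   document_store = FAISSDocumentStore.load(
#       index_path="data/my_index.faiss", config_path="data/my_config.json"
#   )
#   retriever = EmbeddingRetriever(
#       document_store=document_store,
#       embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
#   )
#   for doc in retriever.retrieve(query="example query", top_k=5):
#       print(doc.content)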