Stage 1 of project complete

br0hit · Jun 29, 2023 · daf1320 · daf1320
1 parent 05092f1
commit daf1320
Show file tree

Hide file tree

Showing 244 changed files with 12,425 additions and 0 deletions.
diff --git a/AutoPDFconversion.py b/AutoPDFconversion.py
@@ -0,0 +1,94 @@
+import os
+import shutil
+from PyPDF2 import PdfFileReader, PdfFileWriter
+import PyPDF2
+
+def split_pdf(input_path, output_directory, chunk_size):
+    """Split every PDF found in a directory into fixed-size page chunks.
+
+    Args:
+        input_path: Directory that is scanned for files ending in ".pdf"
+            (despite the name, this is a directory, not a single file).
+        output_directory: Directory that receives the chunk files; it is
+            created when missing.
+        chunk_size: Maximum number of pages per chunk; must be positive.
+
+    Raises:
+        ValueError: If ``chunk_size`` is zero or negative.
+    """
+    # Fail fast: 0 used to raise ZeroDivisionError in the middle of a run and
+    # a negative value silently produced no output at all.
+    if chunk_size <= 0:
+        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
+
+    # exist_ok avoids the check-then-create race of a separate exists() test.
+    os.makedirs(output_directory, exist_ok=True)
+
+    for filename in os.listdir(input_path):
+        if not filename.endswith(".pdf"):
+            continue
+
+        file_path = os.path.join(input_path, filename)
+
+        # The source file stays open until every chunk has been written.
+        with open(file_path, 'rb') as input_file:
+            pdf = PdfFileReader(input_file)
+            total_pages = pdf.getNumPages()
+
+            # Stepping by chunk_size yields one start index per chunk,
+            # including a shorter final chunk when the pages do not divide
+            # evenly.
+            chunk_starts = range(0, total_pages, chunk_size)
+            for chunk_number, start_page in enumerate(chunk_starts, start=1):
+                end_page = min(start_page + chunk_size, total_pages)
+
+                # A fresh writer per chunk keeps chunks independent.
+                output_pdf = PdfFileWriter()
+                for page in range(start_page, end_page):
+                    output_pdf.addPage(pdf.getPage(page))
+
+                # The full source filename (extension included) is kept in
+                # the chunk name, e.g. "report.pdf_chunk_1.pdf".
+                output_file_path = os.path.join(output_directory, f'{filename}_chunk_{chunk_number}.pdf')
+                with open(output_file_path, 'wb') as output_file:
+                    output_pdf.write(output_file)
+
+                print(f'Saved {output_file_path}')
+
+def convert_pdf_to_txt(input_directory, output_directory):
+    """Extract the text of every PDF in a directory into ".txt" files.
+
+    Each file ending in ".pdf" inside ``input_directory`` produces a text
+    file with the same base name inside ``output_directory``. Failures are
+    reported per file and do not stop the remaining conversions.
+
+    Args:
+        input_directory: Directory scanned for PDF files.
+        output_directory: Directory that receives the text files; created
+            on demand. An empty string means the current directory.
+    """
+    for filename in os.listdir(input_directory):
+        if not filename.endswith(".pdf"):
+            continue
+
+        pdf_path = os.path.join(input_directory, filename)
+        txt_path = os.path.join(output_directory, os.path.splitext(filename)[0] + ".txt")
+
+        try:
+            with open(pdf_path, "rb") as pdf_file:
+                reader = PyPDF2.PdfFileReader(pdf_file)
+                # Join once instead of growing a string page by page.
+                text = "".join(
+                    reader.getPage(page_num).extract_text()
+                    for page_num in range(reader.numPages)
+                )
+
+            # dirname is "" when output_directory is empty; os.makedirs("")
+            # would raise and every file would be reported as an error.
+            target_directory = os.path.dirname(txt_path)
+            if target_directory:
+                os.makedirs(target_directory, exist_ok=True)
+
+            with open(txt_path, "w", encoding="utf-8") as txt_file:
+                txt_file.write(text)
+
+            print(f"Successfully converted {filename} to {os.path.basename(txt_path)}")
+
+        except Exception as e:
+            # Deliberate best effort: report the failing file and carry on
+            # with the rest of the directory.
+            print(f"Error converting {filename}: {e}")
+
+
+def AutoConvertPDFtotext(input_directory,chunk_size, output_directory):
+    """Split the PDFs in a directory into chunks and convert each chunk to text.
+
+    Args:
+        input_directory: Directory containing the source PDF files.
+        chunk_size: Maximum number of pages per intermediate PDF chunk.
+        output_directory: Directory that receives one text file per chunk.
+    """
+    # Imported locally so this block brings its own dependency into scope.
+    import tempfile
+
+    # A unique scratch directory replaces the fixed 'temp_chunks' folder:
+    # an existing folder of that name is no longer processed and deleted,
+    # and the context manager cleans up even when a step raises.
+    with tempfile.TemporaryDirectory(prefix='temp_chunks_') as temp_directory:
+        split_pdf(input_directory, temp_directory, chunk_size)
+
+        # Convert the PDF chunks to TXT files
+        convert_pdf_to_txt(temp_directory, output_directory)
+
+
+
+##   Testing   ##
+
+# Manual smoke test: splits the PDFs under "scrap" into 2-page chunks and
+# writes the extracted text to "new_scrap".
+# NOTE(review): this runs whenever the module is imported, not only when it
+# is executed as a script - confirm that is intended.
+pdf_dir, doc_dir = "scrap", "new_scrap"
+
+AutoConvertPDFtotext(input_directory=pdf_dir, chunk_size=2, output_directory=doc_dir)
diff --git a/Junk code/PDFConverter.py b/Junk code/PDFConverter.py
@@ -0,0 +1,108 @@
+import os
+from PyPDF2 import PdfFileReader, PdfFileWriter
+
+def split_pdf(input_path, output_directory, chunk_size):
+    """Split every PDF found in a directory into fixed-size page chunks.
+
+    Args:
+        input_path: Directory that is scanned for files ending in ".pdf"
+            (despite the name, this is a directory, not a single file).
+        output_directory: Directory that receives the chunk files; it is
+            created when missing.
+        chunk_size: Maximum number of pages per chunk; must be positive.
+
+    Raises:
+        ValueError: If ``chunk_size`` is zero or negative.
+    """
+    # Fail fast: 0 used to raise ZeroDivisionError in the middle of a run and
+    # a negative value silently produced no output at all.
+    if chunk_size <= 0:
+        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
+
+    # exist_ok avoids the check-then-create race of a separate exists() test.
+    os.makedirs(output_directory, exist_ok=True)
+
+    for filename in os.listdir(input_path):
+        if not filename.endswith(".pdf"):
+            continue
+
+        file_path = os.path.join(input_path, filename)
+
+        # The source file stays open until every chunk has been written.
+        with open(file_path, 'rb') as input_file:
+            pdf = PdfFileReader(input_file)
+            total_pages = pdf.getNumPages()
+
+            # Stepping by chunk_size yields one start index per chunk,
+            # including a shorter final chunk when the pages do not divide
+            # evenly.
+            chunk_starts = range(0, total_pages, chunk_size)
+            for chunk_number, start_page in enumerate(chunk_starts, start=1):
+                end_page = min(start_page + chunk_size, total_pages)
+
+                # A fresh writer per chunk keeps chunks independent.
+                output_pdf = PdfFileWriter()
+                for page in range(start_page, end_page):
+                    output_pdf.addPage(pdf.getPage(page))
+
+                # The full source filename (extension included) is kept in
+                # the chunk name, e.g. "report.pdf_chunk_1.pdf".
+                output_file_path = os.path.join(output_directory, f'{filename}_chunk_{chunk_number}.pdf')
+                with open(output_file_path, 'wb') as output_file:
+                    output_pdf.write(output_file)
+
+                print(f'Saved {output_file_path}')
+
+# Usage example: replace the placeholder paths below before running.
+# input_directory  - directory containing the PDF files to split
+# output_directory - directory that receives the chunk files
+# chunk_size       - maximum number of pages per chunk
+input_directory, output_directory = 'path/to/input_directory', 'path/to/output_directory'
+chunk_size = 2
+
+split_pdf(
+    input_path=input_directory,
+    output_directory=output_directory,
+    chunk_size=chunk_size,
+)
+
+
+
+
+
+
+##################################    OLD CODE TO SPLIT A SINGLE PDF FILE INTO MULTIPLE PDF CHUNKS   ###########################################
+
+
+
+
+
+
+# import os
+# from PyPDF2 import PdfFileReader, PdfFileWriter
+
+# def split_pdf(input_path, output_directory, chunk_size):
+#     # Create output directory if it doesn't exist
+#     if not os.path.exists(output_directory):
+#         os.makedirs(output_directory)
+
+#     # Read the input PDF file
+#     with open(input_path, 'rb') as input_file:
+#         pdf = PdfFileReader(input_file)
+
+#         # Determine the total number of pages in the PDF
+#         total_pages = pdf.getNumPages()
+
+#         # Calculate the number of chunks
+#         num_chunks = total_pages // chunk_size
+#         if total_pages % chunk_size != 0:
+#             num_chunks += 1
+
+#         # Split the PDF into chunks
+#         for i in range(num_chunks):
+#             start_page = i * chunk_size
+#             end_page = min(start_page + chunk_size, total_pages)
+
+#             # Create a new PDF writer for each chunk
+#             output_pdf = PdfFileWriter()
+
+#             # Extract pages from the input PDF and add them to the chunk
+#             for page in range(start_page, end_page):
+#                 output_pdf.addPage(pdf.getPage(page))
+
+#             # Save the chunk to a new PDF file
+#             output_file_path = os.path.join(output_directory, f'chunk_{i+1}.pdf')
+#             with open(output_file_path, 'wb') as output_file:
+#                 output_pdf.write(output_file)
+
+#             print(f'Saved {output_file_path}')
+
+# # Usage example
+# input_path = 'new_scrap\PM-2020.pdf'  # Replace with the path to your input PDF file
+# output_directory = 'final_scrap/'  # Replace with the desired output directory
+# chunk_size = 2
+
+# split_pdf(input_path, output_directory, chunk_size)
diff --git a/Junk code/PDFtotextConverter.py b/Junk code/PDFtotextConverter.py
@@ -0,0 +1,33 @@
+import os
+import PyPDF2
+
+# Source folder holding the PDF files
+pdf_folder = "final_scrap/"
+
+# Destination folder for the generated TXT files
+txt_folder = "sad_scrap/"
+
+# Walk the source folder and convert every PDF, one text file per PDF
+for filename in os.listdir(pdf_folder):
+    if not filename.endswith(".pdf"):
+        continue
+
+    pdf_path = os.path.join(pdf_folder, filename)
+    txt_path = os.path.join(txt_folder, os.path.splitext(filename)[0] + ".txt")
+
+    try:
+        # Gather the text of all pages while the PDF is open
+        with open(pdf_path, "rb") as pdf_file:
+            reader = PyPDF2.PdfFileReader(pdf_file)
+            text = "".join(
+                reader.getPage(page_index).extract_text()
+                for page_index in range(reader.numPages)
+            )
+
+        # Make sure the destination folder exists before writing
+        os.makedirs(os.path.dirname(txt_path), exist_ok=True)
+
+        with open(txt_path, "w", encoding="utf-8") as txt_file:
+            txt_file.write(text)
+
+        print(f"Successfully converted {filename} to {os.path.basename(txt_path)}")
+
+    except Exception as e:
+        # Report the failure and move on to the next file
+        print(f"Error converting {filename}: {e}")
diff --git a/PM-2020.pdf b/PM-2020.pdf
diff --git a/__init__.py b/__init__.py
diff --git a/__pycache__/AutoPDFconversion.cpython-310.pyc b/__pycache__/AutoPDFconversion.cpython-310.pyc
diff --git a/__pycache__/answer_generator.cpython-310.pyc b/__pycache__/answer_generator.cpython-310.pyc
diff --git a/answer_generator.py b/answer_generator.py
@@ -0,0 +1,128 @@
+# Using a FAISS document store
+from haystack.document_stores import FAISSDocumentStore
+
+# Restore the previously saved FAISS index together with the config file
+# that was written alongside it.
+document_store = FAISSDocumentStore.load(
+    index_path="docstore/my_index.faiss",
+    config_path="docstore/my_config.json",
+)
+
+# Sanity check: the restored store must report the "Flat" index factory
+assert document_store.faiss_index_factory_str == "Flat"
+
+
+# Initializing the prompt node up front to avoid delays later:
+from haystack.nodes import PromptNode,PromptTemplate
+from haystack.pipelines import Pipeline
+
+# # Initializing agent and tools 
+# from haystack.agents import Agent, Tool
+# from haystack.agents.base import ToolsManager
+
+
+from haystack.nodes import EmbeddingRetriever
+# Embedding-based retriever over the loaded document store
+retriever = EmbeddingRetriever(
+    document_store=document_store,
+    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
+)
+# This embedding retriever gave the wrong file for "What are the different billing methdos in SD ??"
+
+# from haystack.nodes import BM25Retriever
+# retriever = BM25Retriever(document_store=document_store, top_k=2)
+
+
+# qa_template = PromptTemplate(
+#     name="Question_and_Answer",
+#     prompt_text="""
+#     You are an AI assistant. Your task is to use the content to give a detailed and easily understandable answer  
+#     Content: {input}\n\n
+#     Answer:
+#     """
+# )
+
+# Long-form QA prompt template. The text asks for an answer of at most
+# 50 words and references two placeholders: {join(documents)} for the
+# related text and {query} for the question.
+# NOTE(review): presumably the pipeline fills these from the retriever
+# output and the user query - confirm against the Haystack version in use.
+lfqa_prompt = PromptTemplate(
+    name="lfqa",
+    prompt_text="""Synthesize a comprehensive answer from the following text for the given question. 
+                             Provide a clear and concise response that summarizes the key points and information presented in the text. 
+                             Your answer should be in your own words and be no longer than 50 words. 
+                             \n\n Related text: {join(documents)} \n\n Question: {query} \n\n 
+                             Final Answer:""",
+)
+
+# The OpenAI key is read from the OPENAI_API_KEY environment variable so that
+# no credential lives in source control; a missing variable fails fast with
+# a KeyError naming it.
+# NOTE(review): the key that used to be hard-coded here was committed to the
+# repository and should be revoked.
+import os
+
+prompt_node_working = PromptNode(
+    "gpt-3.5-turbo",
+    api_key=os.environ["OPENAI_API_KEY"],
+    default_prompt_template=lfqa_prompt,
+    model_kwargs={"stream": True},
+)
+
+# prompt_node = PromptNode("distilbert-base-cased-distilled-squad",default_prompt_template=lfqa_prompt,model_kwargs={"stream":True})
+
+# from haystack.nodes import OpenAIAnswerGenerator
+# generator = OpenAIAnswerGenerator(api_key="<redacted - load from the OPENAI_API_KEY environment variable>")
+
+# from haystack.pipelines import GenerativeQAPipeline
+
+# pipeline = GenerativeQAPipeline(generator=generator, retriever=retriever)
+# result = pipeline.run(query='How to create a sales order', params={"Retriever": {"top_k": 1}})
+
+# Two-stage pipeline: the retriever receives the query, and the prompt node
+# receives the retriever's output.
+query_pipeline = Pipeline()
+query_pipeline.add_node(
+    component=retriever,
+    name="Retriever",
+    inputs=["Query"],
+)
+query_pipeline.add_node(
+    component=prompt_node_working,
+    name="prompt_node",
+    inputs=["Retriever"],
+)
+
+
+## This works perfectly for lfqa, Maybe 
+
+
+# Creating a function to integrate all of this:
+
+def question_answering_bot(input_question):
+    """Run a question through the query pipeline and return its results.
+
+    The retriever is asked for its top 3 documents. The value stored under
+    the "results" key of the pipeline output is returned unchanged.
+    """
+    retriever_params = {"Retriever": {"top_k": 3}}
+    pipeline_output = query_pipeline.run(query=input_question, params=retriever_params)
+    return pipeline_output["results"]
+
+
+
+# # Extract the 'content' value from each document
+# contents = [doc.content for doc in result['documents']]
+
+# # Print the contents
+# for content in contents:
+#     print(content)
+
+# # Extract the 'content' value from each document
+# contents = [doc.content for doc in result['documents']]
+
+# # Join all the content values into a single string
+# joined_content = '\n'.join(contents)
+
+# result = prompt_node.prompt(prompt_template=qa_template, input=joined_content)
+# print(result)
+# query_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["Retriever"])
+
+# hotpot_questions = [
+#     "What are the different billing methods  ?"
+# ]
+
+
+# for question in hotpot_questions:
+#     output = query_pipeline.run(query=question)
+#     print(output["results"])
+
+
+    # while(True):
+    #     input_question = input("Enter the quesiton which you want to ask the bot : ")
+    #     if(input_question=="#"):
+    #         print("thank you ")
+    #         break
+    #     else:
+    #         question_answering_bot(input_question)
+
+    ## Testing llms 
+
+
+# Demo query; the header and the reply are printed on separate lines.
+reply = question_answering_bot("Expalin the policies of performance security bond ?")
+print("\n\n\n RESULT\n", reply, sep="\n")
diff --git a/audio.wav b/audio.wav
diff --git a/docstore/my_config.json b/docstore/my_config.json
@@ -0,0 +1 @@
+{"faiss_index_factory_str": "Flat"}
diff --git a/docstore/my_index.faiss b/docstore/my_index.faiss
diff --git a/faiss_document_store.db b/faiss_document_store.db