Stage 1 of project complete
br0hit committed Jun 29, 2023
1 parent 05092f1 commit daf1320
Showing 244 changed files with 12,425 additions and 0 deletions.
94 changes: 94 additions & 0 deletions AutoPDFconversion.py
@@ -0,0 +1,94 @@
import os
import shutil
from PyPDF2 import PdfFileReader, PdfFileWriter
import PyPDF2

def split_pdf(input_path, output_directory, chunk_size):
    # Create output directory if it doesn't exist
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    # Iterate over files in the input directory
    for filename in os.listdir(input_path):
        if filename.endswith(".pdf"):
            file_path = os.path.join(input_path, filename)

            # Read the input PDF file
            with open(file_path, 'rb') as input_file:
                pdf = PdfFileReader(input_file)

                # Determine the total number of pages in the PDF
                total_pages = pdf.getNumPages()

                # Calculate the number of chunks
                num_chunks = total_pages // chunk_size
                if total_pages % chunk_size != 0:
                    num_chunks += 1

                # Split the PDF into chunks
                for i in range(num_chunks):
                    start_page = i * chunk_size
                    end_page = min(start_page + chunk_size, total_pages)

                    # Create a new PDF writer for each chunk
                    output_pdf = PdfFileWriter()

                    # Extract pages from the input PDF and add them to the chunk
                    for page in range(start_page, end_page):
                        output_pdf.addPage(pdf.getPage(page))

                    # Save the chunk to a new PDF file
                    output_file_path = os.path.join(output_directory, f'{filename}_chunk_{i+1}.pdf')
                    with open(output_file_path, 'wb') as output_file:
                        output_pdf.write(output_file)

                    print(f'Saved {output_file_path}')

def convert_pdf_to_txt(input_directory, output_directory):
    # Iterate over each PDF file in the input directory
    for filename in os.listdir(input_directory):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(input_directory, filename)
            txt_path = os.path.join(output_directory, os.path.splitext(filename)[0] + ".txt")

            # Convert PDF to TXT
            try:
                with open(pdf_path, "rb") as pdf_file:
                    reader = PyPDF2.PdfFileReader(pdf_file)
                    text = ""
                    for page_num in range(reader.numPages):
                        page = reader.getPage(page_num)
                        text += page.extract_text()

                os.makedirs(os.path.dirname(txt_path), exist_ok=True)

                with open(txt_path, "w", encoding="utf-8") as txt_file:
                    txt_file.write(text)

                print(f"Successfully converted {filename} to {os.path.basename(txt_path)}")

            except Exception as e:
                print(f"Error converting {filename}: {e}")


def AutoConvertPDFtotext(input_directory, chunk_size, output_directory):

    # Create a temporary directory for storing the intermediate PDF chunks
    temp_directory = 'temp_chunks'
    split_pdf(input_directory, temp_directory, chunk_size)

    # Convert the PDF chunks to TXT files
    convert_pdf_to_txt(temp_directory, output_directory)

    # Delete the temporary directory
    shutil.rmtree(temp_directory)



## Testing ##


pdf_dir = "scrap"
doc_dir = "new_scrap"

AutoConvertPDFtotext(pdf_dir, 2, doc_dir)
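
Note on the PyPDF2 API used above: PdfFileReader, PdfFileWriter, getNumPages, addPage and getPage belong to the PyPDF2 1.x/2.x line and were removed in PyPDF2 3.x and its successor pypdf, so this script only runs against an older install. A minimal sketch of the same splitting step against the modern pypdf API, assuming pypdf >= 3.0 is installed (split_pdf_modern is an illustrative name, not part of this commit):

import os
from pypdf import PdfReader, PdfWriter

def split_pdf_modern(pdf_path, output_directory, chunk_size):
    # PdfReader/PdfWriter replace PdfFileReader/PdfFileWriter in pypdf 3.x
    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)  # replaces getNumPages()
    os.makedirs(output_directory, exist_ok=True)
    for start in range(0, total_pages, chunk_size):
        writer = PdfWriter()
        for page in reader.pages[start:start + chunk_size]:
            writer.add_page(page)  # replaces addPage(pdf.getPage(i))
        chunk_path = os.path.join(output_directory, f"chunk_{start // chunk_size + 1}.pdf")
        with open(chunk_path, "wb") as out:
            writer.write(out)

Text extraction carries over largely unchanged: reader.pages[n].extract_text() in pypdf corresponds to the getPage(n)/extract_text() pair used in convert_pdf_to_txt above.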
108 changes: 108 additions & 0 deletions Junk code/PDFConverter.py
@@ -0,0 +1,108 @@
import os
from PyPDF2 import PdfFileReader, PdfFileWriter

def split_pdf(input_path, output_directory, chunk_size):
    # Create output directory if it doesn't exist
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    # Iterate over files in the input directory
    for filename in os.listdir(input_path):
        if filename.endswith(".pdf"):
            file_path = os.path.join(input_path, filename)

            # Read the input PDF file
            with open(file_path, 'rb') as input_file:
                pdf = PdfFileReader(input_file)

                # Determine the total number of pages in the PDF
                total_pages = pdf.getNumPages()

                # Calculate the number of chunks
                num_chunks = total_pages // chunk_size
                if total_pages % chunk_size != 0:
                    num_chunks += 1

                # Split the PDF into chunks
                for i in range(num_chunks):
                    start_page = i * chunk_size
                    end_page = min(start_page + chunk_size, total_pages)

                    # Create a new PDF writer for each chunk
                    output_pdf = PdfFileWriter()

                    # Extract pages from the input PDF and add them to the chunk
                    for page in range(start_page, end_page):
                        output_pdf.addPage(pdf.getPage(page))

                    # Save the chunk to a new PDF file
                    output_file_path = os.path.join(output_directory, f'{filename}_chunk_{i+1}.pdf')
                    with open(output_file_path, 'wb') as output_file:
                        output_pdf.write(output_file)

                    print(f'Saved {output_file_path}')

# Usage example
input_directory = 'path/to/input_directory'  # Replace with the path to your input directory containing PDF files
output_directory = 'path/to/output_directory'  # Replace with the desired output directory
chunk_size = 2

split_pdf(input_directory, output_directory, chunk_size)






################################## OLD CODE TO CONVERT A SINGLE PDF FILE INTO MULTIPLE TEXT FILES ###########################################






# import os
# from PyPDF2 import PdfFileReader, PdfFileWriter

# def split_pdf(input_path, output_directory, chunk_size):
#     # Create output directory if it doesn't exist
#     if not os.path.exists(output_directory):
#         os.makedirs(output_directory)

#     # Read the input PDF file
#     with open(input_path, 'rb') as input_file:
#         pdf = PdfFileReader(input_file)

#         # Determine the total number of pages in the PDF
#         total_pages = pdf.getNumPages()

#         # Calculate the number of chunks
#         num_chunks = total_pages // chunk_size
#         if total_pages % chunk_size != 0:
#             num_chunks += 1

#         # Split the PDF into chunks
#         for i in range(num_chunks):
#             start_page = i * chunk_size
#             end_page = min(start_page + chunk_size, total_pages)

#             # Create a new PDF writer for each chunk
#             output_pdf = PdfFileWriter()

#             # Extract pages from the input PDF and add them to the chunk
#             for page in range(start_page, end_page):
#                 output_pdf.addPage(pdf.getPage(page))

#             # Save the chunk to a new PDF file
#             output_file_path = os.path.join(output_directory, f'chunk_{i+1}.pdf')
#             with open(output_file_path, 'wb') as output_file:
#                 output_pdf.write(output_file)

#             print(f'Saved {output_file_path}')

# # Usage example
# input_path = 'new_scrap\PM-2020.pdf'  # Replace with the path to your input PDF file
# output_directory = 'final_scrap/'  # Replace with the desired output directory
# chunk_size = 2

# split_pdf(input_path, output_directory, chunk_size)
33 changes: 33 additions & 0 deletions Junk code/PDFtotextConverter.py
@@ -0,0 +1,33 @@
import os
import PyPDF2

# Set the path to the folder containing the PDF files
pdf_folder = "final_scrap/"

# Set the path to the folder where you want to store the TXT files
txt_folder = "sad_scrap/"

# Iterate over each PDF file in the folder
for filename in os.listdir(pdf_folder):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(pdf_folder, filename)
        txt_path = os.path.join(txt_folder, os.path.splitext(filename)[0] + ".txt")

        # Convert PDF to TXT
        try:
            with open(pdf_path, "rb") as pdf_file:
                reader = PyPDF2.PdfFileReader(pdf_file)
                text = ""
                for page_num in range(reader.numPages):
                    page = reader.getPage(page_num)
                    text += page.extract_text()

            os.makedirs(os.path.dirname(txt_path), exist_ok=True)

            with open(txt_path, "w", encoding="utf-8") as txt_file:
                txt_file.write(text)

            print(f"Successfully converted {filename} to {os.path.basename(txt_path)}")

        except Exception as e:
            print(f"Error converting {filename}: {e}")
Binary file added PM-2020.pdf
Empty file added __init__.py
Binary file added __pycache__/AutoPDFconversion.cpython-310.pyc
Binary file added __pycache__/answer_generator.cpython-310.pyc
128 changes: 128 additions & 0 deletions answer_generator.py
@@ -0,0 +1,128 @@
# Using a FAISS document store
from haystack.document_stores import FAISSDocumentStore

# Load the saved index into a new DocumentStore instance:
# Also, provide `config_path` parameter if you set it when calling the `save()` method:
document_store = FAISSDocumentStore.load(index_path="docstore/my_index.faiss", config_path="docstore/my_config.json")

# Check if the DocumentStore is loaded correctly
assert document_store.faiss_index_factory_str == "Flat"


# Initializing the prompt node from the start to avoid delays later:
from haystack.nodes import PromptNode, PromptTemplate
from haystack.pipelines import Pipeline

# # Initializing agent and tools
# from haystack.agents import Agent, Tool
# from haystack.agents.base import ToolsManager


from haystack.nodes import EmbeddingRetriever
retriever = EmbeddingRetriever(
    document_store=document_store, embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1"
)
# This embedding retriever gave the wrong file for "What are the different billing methods ??"

# from haystack.nodes import BM25Retriever
# retriever = BM25Retriever(document_store=document_store, top_k=2)


# qa_template = PromptTemplate(
#     name="Question_and_Answer",
#     prompt_text="""
#     You are an AI assistant. Your task is to use the content to give a detailed and easily understandable answer
#     Content: {input}\n\n
#     Answer:
#     """
# )

lfqa_prompt = PromptTemplate(
    name="lfqa",
    prompt_text="""Synthesize a comprehensive answer from the following text for the given question.
    Provide a clear and concise response that summarizes the key points and information presented in the text.
    Your answer should be in your own words and be no longer than 50 words.
    \n\n Related text: {join(documents)} \n\n Question: {query} \n\n
    Final Answer:""",
)

prompt_node_working = PromptNode("gpt-3.5-turbo", api_key="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", default_prompt_template=lfqa_prompt,model_kwargs={"stream":True})

# prompt_node = PromptNode("distilbert-base-cased-distilled-squad",default_prompt_template=lfqa_prompt,model_kwargs={"stream":True})

# from haystack.nodes import OpenAIAnswerGenerator
# generator = OpenAIAnswerGenerator(api_key="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")

# from haystack.pipelines import GenerativeQAPipeline

# pipeline = GenerativeQAPipeline(generator=generator, retriever=retriever)
# result = pipeline.run(query='How to create a sales order', params={"Retriever": {"top_k": 1}})

query_pipeline = Pipeline()
query_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])

query_pipeline.add_node(component=prompt_node_working, name="prompt_node", inputs=["Retriever"])


## This seems to work well for LFQA


# Creating a function to integrate all of this:

def question_answering_bot(input_question):
    answer = query_pipeline.run(query=input_question, params={"Retriever": {"top_k": 3}})

    # # Assuming 'answer' is a Document object
    # response = {
    #     'text': answer.text,
    #     'start': answer.start,
    #     'end': answer.end,
    #     'score': answer.score,
    # }
    # return response

    return answer["results"]



# # Extract the 'content' value from each document
# contents = [doc.content for doc in result['documents']]

# # Print the contents
# for content in contents:
#     print(content)

# # Extract the 'content' value from each document
# contents = [doc.content for doc in result['documents']]

# # Join all the content values into a single string
# joined_content = '\n'.join(contents)

# result = prompt_node.prompt(prompt_template=qa_template, input=joined_content)
# print(result)
# query_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["Retriever"])

# hotpot_questions = [
#     "What are the different billing methods ?"
# ]


# for question in hotpot_questions:
#     output = query_pipeline.run(query=question)
#     print(output["results"])


# while(True):
#     input_question = input("Enter the question which you want to ask the bot : ")
#     if(input_question=="#"):
#         print("thank you ")
#         break
#     else:
#         question_answering_bot(input_question)

## Testing LLMs


reply = question_answering_bot("Explain the policies of performance security bond?")
print("\n\n\n RESULT\n")
print(reply)
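
The FAISS index loaded at the top of this file (docstore/my_index.faiss plus docstore/my_config.json) has to be built in a separate indexing step that is not shown in this commit. A rough sketch of how such a store could be created with the same Haystack 1.x API, assuming the .txt files produced by AutoConvertPDFtotext sit in new_scrap and the same sentence-transformers model is used for embedding (directory names and preprocessing settings here are assumptions, not taken from the repository):

import os
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import EmbeddingRetriever, PreProcessor, TextConverter

# Fresh "Flat" FAISS store; Haystack also creates a SQLite sidecar database by default
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

converter = TextConverter(valid_languages=["en"])
preprocessor = PreProcessor(split_by="word", split_length=200, split_overlap=20)

# Convert each extracted .txt file into Haystack Documents and split into passages
docs = []
for name in os.listdir("new_scrap"):
    if name.endswith(".txt"):
        docs += converter.convert(file_path=os.path.join("new_scrap", name), meta={"name": name})
document_store.write_documents(preprocessor.process(docs))

# Embed with the same model the retriever in answer_generator.py expects
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
)
document_store.update_embeddings(retriever)
document_store.save(index_path="docstore/my_index.faiss", config_path="docstore/my_config.json")

document_store.save() writes both the .faiss index and the JSON config that FAISSDocumentStore.load() reads back above, and the default SQLite sidecar is presumably how the faiss_document_store.db added in this commit was produced.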
Binary file added audio.wav
1 change: 1 addition & 0 deletions docstore/my_config.json
@@ -0,0 +1 @@
{"faiss_index_factory_str": "Flat"}
Binary file added docstore/my_index.faiss
Binary file added faiss_document_store.db
