-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
244 changed files
with
12,425 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
import os | ||
import shutil | ||
from PyPDF2 import PdfFileReader, PdfFileWriter | ||
import PyPDF2 | ||
|
||
def split_pdf(input_path, output_directory, chunk_size):
    """Split every PDF in a directory into chunks of at most ``chunk_size`` pages.

    Each entry of ``input_path`` whose name ends with ``.pdf`` is read and
    written back out as a series of smaller PDFs. Chunks are saved in
    ``output_directory`` as ``<original filename>_chunk_<n>.pdf`` with ``n``
    starting at 1, and the path of every saved chunk is printed.

    Args:
        input_path: Directory containing the PDF files to split.
        output_directory: Directory that receives the chunks; it is created
            (including parents) when it does not exist yet.
        chunk_size: Maximum number of pages per chunk; must be at least 1.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # Fail fast: a chunk size of 0 used to raise ZeroDivisionError only once a
    # PDF was encountered, and negative sizes silently produced no output.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size!r}")

    os.makedirs(output_directory, exist_ok=True)

    for filename in os.listdir(input_path):
        if not filename.endswith(".pdf"):
            continue
        file_path = os.path.join(input_path, filename)

        # NOTE(review): chunks are written while the source file is still
        # open; the reader presumably resolves page content lazily from this
        # stream, so the handle must outlive every write() below.
        with open(file_path, 'rb') as input_file:
            pdf = PdfFileReader(input_file)
            total_pages = pdf.getNumPages()

            # Stepping the range by chunk_size yields the first page of each
            # chunk, including a shorter final chunk when pages are left over.
            chunk_starts = range(0, total_pages, chunk_size)
            for chunk_number, start_page in enumerate(chunk_starts, start=1):
                end_page = min(start_page + chunk_size, total_pages)

                output_pdf = PdfFileWriter()
                for page in range(start_page, end_page):
                    output_pdf.addPage(pdf.getPage(page))

                output_file_path = os.path.join(output_directory, f'{filename}_chunk_{chunk_number}.pdf')
                with open(output_file_path, 'wb') as output_file:
                    output_pdf.write(output_file)

                print(f'Saved {output_file_path}')
|
||
def convert_pdf_to_txt(input_directory, output_directory):
    """Extract the text of every PDF in a directory into UTF-8 ``.txt`` files.

    For each entry of ``input_directory`` whose name ends with ``.pdf`` the
    text of all pages is concatenated and written to
    ``<output_directory>/<name without extension>.txt``. Conversion is
    best-effort: a failure on one file is printed and the loop moves on to
    the next file.

    Args:
        input_directory: Directory containing the PDF files to convert.
        output_directory: Directory that receives the text files; created on
            demand right before the first file is written.
    """
    for filename in os.listdir(input_directory):
        if not filename.endswith(".pdf"):
            continue

        pdf_path = os.path.join(input_directory, filename)
        txt_path = os.path.join(output_directory, os.path.splitext(filename)[0] + ".txt")

        try:
            with open(pdf_path, "rb") as pdf_file:
                reader = PyPDF2.PdfFileReader(pdf_file)
                # join() instead of repeated `+=` keeps this linear in the
                # amount of extracted text.
                text = "".join(
                    reader.getPage(page_num).extract_text()
                    for page_num in range(reader.numPages)
                )

            # An empty output_directory means "current directory"; makedirs("")
            # would raise, so only create a directory when there is one to create.
            target_directory = os.path.dirname(txt_path)
            if target_directory:
                os.makedirs(target_directory, exist_ok=True)

            with open(txt_path, "w", encoding="utf-8") as txt_file:
                txt_file.write(text)

            print(f"Successfully converted {filename} to {os.path.basename(txt_path)}")

        except Exception as e:
            # Deliberately broad: one unreadable PDF must not abort the batch.
            print(f"Error converting {filename}: {e}")
|
||
|
||
def AutoConvertPDFtotext(input_directory, chunk_size, output_directory):
    """Split every PDF in a directory into chunks and convert each chunk to text.

    The PDFs are first split with ``split_pdf`` into a scratch directory, then
    every chunk is converted with ``convert_pdf_to_txt`` into
    ``output_directory``.

    Args:
        input_directory: Directory containing the source PDF files.
        chunk_size: Maximum number of pages per intermediate PDF chunk.
        output_directory: Directory that receives the resulting text files.
    """
    # Local import so this function does not depend on a file-level import.
    import tempfile

    # A unique scratch directory replaces the fixed 'temp_chunks' folder in
    # the working directory: stale files from an earlier run can no longer be
    # picked up, an unrelated folder of that name is never deleted, and the
    # context manager removes the chunks even when a step raises.
    with tempfile.TemporaryDirectory(prefix="temp_chunks_") as temp_directory:
        split_pdf(input_directory, temp_directory, chunk_size)
        convert_pdf_to_txt(temp_directory, output_directory)
|
||
|
||
|
||
## Testing ## | ||
|
||
|
||
# Smoke test: split the PDFs under "scrap" into 2-page chunks and write the
# extracted text to "new_scrap".
pdf_dir, doc_dir = "scrap", "new_scrap"

AutoConvertPDFtotext(input_directory=pdf_dir, chunk_size=2, output_directory=doc_dir)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
import os | ||
from PyPDF2 import PdfFileReader, PdfFileWriter | ||
|
||
def split_pdf(input_path, output_directory, chunk_size):
    """Split every PDF in a directory into chunks of at most ``chunk_size`` pages.

    Each entry of ``input_path`` whose name ends with ``.pdf`` is read and
    written back out as a series of smaller PDFs. Chunks are saved in
    ``output_directory`` as ``<original filename>_chunk_<n>.pdf`` with ``n``
    starting at 1, and the path of every saved chunk is printed.

    Args:
        input_path: Directory containing the PDF files to split.
        output_directory: Directory that receives the chunks; it is created
            (including parents) when it does not exist yet.
        chunk_size: Maximum number of pages per chunk; must be at least 1.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # Fail fast: a chunk size of 0 used to raise ZeroDivisionError only once a
    # PDF was encountered, and negative sizes silently produced no output.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size!r}")

    os.makedirs(output_directory, exist_ok=True)

    for filename in os.listdir(input_path):
        if not filename.endswith(".pdf"):
            continue
        file_path = os.path.join(input_path, filename)

        # NOTE(review): chunks are written while the source file is still
        # open; the reader presumably resolves page content lazily from this
        # stream, so the handle must outlive every write() below.
        with open(file_path, 'rb') as input_file:
            pdf = PdfFileReader(input_file)
            total_pages = pdf.getNumPages()

            # Stepping the range by chunk_size yields the first page of each
            # chunk, including a shorter final chunk when pages are left over.
            chunk_starts = range(0, total_pages, chunk_size)
            for chunk_number, start_page in enumerate(chunk_starts, start=1):
                end_page = min(start_page + chunk_size, total_pages)

                output_pdf = PdfFileWriter()
                for page in range(start_page, end_page):
                    output_pdf.addPage(pdf.getPage(page))

                output_file_path = os.path.join(output_directory, f'{filename}_chunk_{chunk_number}.pdf')
                with open(output_file_path, 'wb') as output_file:
                    output_pdf.write(output_file)

                print(f'Saved {output_file_path}')
|
||
# Usage example: point both directories at real locations before running.
chunk_size = 2
input_directory = 'path/to/input_directory'  # directory containing the PDF files to split
output_directory = 'path/to/output_directory'  # directory that receives the chunks

split_pdf(input_directory, output_directory, chunk_size)
|
||
|
||
|
||
|
||
|
||
|
||
################################## OLD CODE TO CONVERT A SINGLE PDF FILE INTO MULTIPLE TEXT FILES ########################################### | ||
|
||
|
||
|
||
|
||
|
||
|
||
# import os | ||
# from PyPDF2 import PdfFileReader, PdfFileWriter | ||
|
||
# def split_pdf(input_path, output_directory, chunk_size): | ||
# # Create output directory if it doesn't exist | ||
# if not os.path.exists(output_directory): | ||
# os.makedirs(output_directory) | ||
|
||
# # Read the input PDF file | ||
# with open(input_path, 'rb') as input_file: | ||
# pdf = PdfFileReader(input_file) | ||
|
||
# # Determine the total number of pages in the PDF | ||
# total_pages = pdf.getNumPages() | ||
|
||
# # Calculate the number of chunks | ||
# num_chunks = total_pages // chunk_size | ||
# if total_pages % chunk_size != 0: | ||
# num_chunks += 1 | ||
|
||
# # Split the PDF into chunks | ||
# for i in range(num_chunks): | ||
# start_page = i * chunk_size | ||
# end_page = min(start_page + chunk_size, total_pages) | ||
|
||
# # Create a new PDF writer for each chunk | ||
# output_pdf = PdfFileWriter() | ||
|
||
# # Extract pages from the input PDF and add them to the chunk | ||
# for page in range(start_page, end_page): | ||
# output_pdf.addPage(pdf.getPage(page)) | ||
|
||
# # Save the chunk to a new PDF file | ||
# output_file_path = os.path.join(output_directory, f'chunk_{i+1}.pdf') | ||
# with open(output_file_path, 'wb') as output_file: | ||
# output_pdf.write(output_file) | ||
|
||
# print(f'Saved {output_file_path}') | ||
|
||
# # Usage example | ||
# input_path = 'new_scrap\PM-2020.pdf' # Replace with the path to your input PDF file | ||
# output_directory = 'final_scrap/' # Replace with the desired output directory | ||
# chunk_size = 2 | ||
|
||
# split_pdf(input_path, output_directory, chunk_size) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
import os | ||
import PyPDF2 | ||
|
||
# Folder containing the PDF files to convert
pdf_folder = "final_scrap/"

# Folder that receives the extracted TXT files
txt_folder = "sad_scrap/"

# Convert every PDF in pdf_folder to a UTF-8 text file of the same base name.
for filename in os.listdir(pdf_folder):
    if not filename.endswith(".pdf"):
        continue

    pdf_path = os.path.join(pdf_folder, filename)
    txt_path = os.path.join(txt_folder, os.path.splitext(filename)[0] + ".txt")

    try:
        with open(pdf_path, "rb") as pdf_file:
            reader = PyPDF2.PdfFileReader(pdf_file)
            # join() instead of repeated `+=` keeps this linear in the amount
            # of extracted text.
            text = "".join(
                reader.getPage(page_num).extract_text()
                for page_num in range(reader.numPages)
            )

        os.makedirs(os.path.dirname(txt_path), exist_ok=True)

        with open(txt_path, "w", encoding="utf-8") as txt_file:
            txt_file.write(text)

        print(f"Successfully converted {filename} to {os.path.basename(txt_path)}")

    except Exception as e:
        # Deliberately broad: one unreadable PDF must not abort the batch.
        print(f"Error converting {filename}: {e}")
Binary file not shown.
Empty file.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
# Using a FAISS document store | ||
from haystack.document_stores import FAISSDocumentStore | ||
|
||
# Load the saved index into a new DocumentStore instance.
# `config_path` must be provided as well if it was set when calling `save()`.
document_store = FAISSDocumentStore.load(
    index_path="docstore/my_index.faiss",
    config_path="docstore/my_config.json",
)

# Check that the DocumentStore was loaded correctly. An explicit raise is used
# instead of a bare `assert` so the check still runs under `python -O`; the
# exception type stays AssertionError.
if document_store.faiss_index_factory_str != "Flat":
    raise AssertionError(
        "Unexpected FAISS index factory "
        f"{document_store.faiss_index_factory_str!r}; expected 'Flat'"
    )
|
||
|
||
# Initializing the prompt node up front to avoid delays later: | ||
from haystack.nodes import PromptNode,PromptTemplate | ||
from haystack.pipelines import Pipeline | ||
|
||
# # Initializing agent and tools | ||
# from haystack.agents import Agent, Tool | ||
# from haystack.agents.base import ToolsManager | ||
|
||
|
||
from haystack.nodes import EmbeddingRetriever | ||
# Retriever backed by the loaded document store and a sentence-transformers
# embedding model.
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
)
# This embedding retriever gave the wrong file for "What are the different billing methdos in SD ??" | ||
|
||
# from haystack.nodes import BM25Retriever | ||
# retriever = BM25Retriever(document_store=document_store, top_k=2) | ||
|
||
|
||
# qa_template = PromptTemplate( | ||
# name="Question_and_Answer", | ||
# prompt_text=""" | ||
# You are an AI assistant. Your task is to use the content to give a detailed and easily understandable answer | ||
# Content: {input}\n\n | ||
# Answer: | ||
# """ | ||
# ) | ||
|
||
# Long-form QA prompt template named "lfqa": the retrieved documents are
# inserted via {join(documents)} and the user question via {query}; the model
# is instructed to answer in its own words in no more than 50 words.
lfqa_prompt = PromptTemplate( | ||
name="lfqa", | ||
prompt_text="""Synthesize a comprehensive answer from the following text for the given question. | ||
Provide a clear and concise response that summarizes the key points and information presented in the text. | ||
Your answer should be in your own words and be no longer than 50 words. | ||
\n\n Related text: {join(documents)} \n\n Question: {query} \n\n | ||
Final Answer:""", | ||
) | ||
|
||
import os

# The OpenAI key is read from the OPENAI_API_KEY environment variable so that
# it never lives in source control; a missing variable fails loudly with
# KeyError instead of sending requests with a bad key.
# NOTE(review): the key that used to be hard-coded on this line is exposed in
# the repository history and should be revoked.
prompt_node_working = PromptNode(
    "gpt-3.5-turbo",
    api_key=os.environ["OPENAI_API_KEY"],
    default_prompt_template=lfqa_prompt,
    model_kwargs={"stream": True},
)
|
||
# prompt_node = PromptNode("distilbert-base-cased-distilled-squad",default_prompt_template=lfqa_prompt,model_kwargs={"stream":True}) | ||
|
||
# from haystack.nodes import OpenAIAnswerGenerator | ||
# generator = OpenAIAnswerGenerator(api_key=os.environ["OPENAI_API_KEY"])  # key redacted: never commit API keys; revoke the one previously written here | ||
|
||
# from haystack.pipelines import GenerativeQAPipeline | ||
|
||
# pipeline = GenerativeQAPipeline(generator=generator, retriever=retriever) | ||
# result = pipeline.run(query='How to create a sales order', params={"Retriever": {"top_k": 1}}) | ||
|
||
# Query pipeline: Query -> Retriever -> prompt_node
query_pipeline = Pipeline()
query_pipeline.add_node(
    component=retriever,
    name="Retriever",
    inputs=["Query"],
)
query_pipeline.add_node(
    component=prompt_node_working,
    name="prompt_node",
    inputs=["Retriever"],
)
|
||
|
||
## This works perfectly for lfqa, Maybe | ||
|
||
|
||
# Creating a function to integrate all this: | ||
|
||
def question_answering_bot(input_question, top_k=3):
    """Answer a question by running it through the module-level query pipeline.

    Args:
        input_question: Question text passed to the pipeline as the query.
        top_k: Value forwarded as the ``top_k`` parameter of the "Retriever"
            node. Defaults to 3, the value that used to be hard-coded.

    Returns:
        The ``"results"`` entry of the pipeline output.
    """
    answer = query_pipeline.run(
        query=input_question,
        params={"Retriever": {"top_k": top_k}},
    )
    return answer["results"]
|
||
|
||
|
||
# # Extract the 'content' value from each document | ||
# contents = [doc.content for doc in result['documents']] | ||
|
||
# # Print the contents | ||
# for content in contents: | ||
# print(content) | ||
|
||
# # Extract the 'content' value from each document | ||
# contents = [doc.content for doc in result['documents']] | ||
|
||
# # Join all the content values into a single string | ||
# joined_content = '\n'.join(contents) | ||
|
||
# result = prompt_node.prompt(prompt_template=qa_template, input=joined_content) | ||
# print(result) | ||
# query_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["Retriever"]) | ||
|
||
# hotpot_questions = [ | ||
# "What are the different billing methods ?" | ||
# ] | ||
|
||
|
||
# for question in hotpot_questions: | ||
# output = query_pipeline.run(query=question) | ||
# print(output["results"]) | ||
|
||
|
||
# while(True): | ||
# input_question = input("Enter the quesiton which you want to ask the bot : ") | ||
# if(input_question=="#"): | ||
# print("thank you ") | ||
# break | ||
# else: | ||
# question_answering_bot(input_question) | ||
|
||
## Testing llms | ||
|
||
|
||
# Smoke test: ask a single question and print what the pipeline returns.
reply = question_answering_bot("Expalin the policies of performance security bond ?")
print("\n\n\n RESULT\n", reply, sep="\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"faiss_index_factory_str": "Flat"} |
Binary file not shown.
Binary file not shown.
Oops, something went wrong.