
Commit

Merge pull request #12 from electify-eu/bundestag-2025
Fixed SEO and PDF parsing issue with bold text; removed AfD disclaimer when party names are hidden
cliedl authored Feb 11, 2025
2 parents 18b8b80 + 6b73d39 commit df77ba6
Showing 13 changed files with 434 additions and 480 deletions.
201 changes: 44 additions & 157 deletions App.py

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions Dockerfile
@@ -19,8 +19,8 @@ RUN uv sync --frozen --no-dev
# Activate virtual environment
ENV PATH="/app/.venv/bin:$PATH"

-# Copy custom index
-COPY streamlit_app/index.html /usr/local/lib/python3.11/site-packages/streamlit/static/index.html
+# Copy custom index.html with SEO tags
+RUN cp streamlit_app/index.html $(python -c "import streamlit; import os; print(os.path.dirname(streamlit.__file__))")/static/index.html

# Expose port 8080 to world outside of the container
EXPOSE 8080
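
The rewritten copy step no longer hard-codes /usr/local/lib/python3.11/site-packages; it asks the installed streamlit package where it lives, so the custom index.html with SEO tags lands in the right place inside the uv-managed virtual environment and survives Python upgrades. A minimal sketch of what the embedded python -c one-liner resolves to (the printed path is illustrative, not taken from the image):

import os
import streamlit

# Same resolution as the Dockerfile's RUN line: the directory of the streamlit package
# plus its static/ subfolder, which holds the index.html that Streamlit serves.
static_dir = os.path.join(os.path.dirname(streamlit.__file__), "static")
print(static_dir)  # e.g. /app/.venv/lib/python3.11/site-packages/streamlit/static (example only)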
262 changes: 131 additions & 131 deletions RAG/database/vector_database.py
@@ -1,143 +1,143 @@
import glob
import os

from langchain_community.document_loaders import PDFMinerLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter


class VectorDatabase:
def __init__(
self,
embedding_model,
source_type, # "manifestos" or "debates"
data_path=".",
database_directory="./chroma",
chunk_size=1000,
chunk_overlap=200,
loader="pdf",
reload=True,
):
"""
Initializes the VectorDatabase.
Parameters:
- embedding_model: The model used to generate embeddings for the documents.
- source_type (str): The type of source documents, either "manifestos" or "debates".
- data_path (str): The directory where the source documents are located. Defaults to the current directory.
- database_directory (str): The directory in which the Chroma database is stored. Defaults to './chroma'.
- chunk_size (int): The size of the text chunks the documents are split into. Defaults to 1000.
- chunk_overlap (int): The number of characters adjacent chunks overlap. Defaults to 200.
- loader (str): "pdf" or "csv", depending on the data format.
- reload (bool): If True, an existing Chroma database is loaded from database_directory on initialization.
"""

self.embedding_model = embedding_model
self.source_type = source_type
self.data_path = data_path
self.database_directory = database_directory
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.loader = loader

if reload:
self.database = self.load_database()

def load_database(self):
"""
Loads an existing Chroma database.
Returns:
- The loaded Chroma database.
"""
if os.path.exists(self.database_directory):
self.database = Chroma(persist_directory=self.database_directory, embedding_function=self.embedding_model)
print("reloaded database")
else:
raise AssertionError(f"{self.database_directory} does not contain a database.")

return self.database

def build_database(self, overwrite=True):
"""
Builds a new Chroma database from the documents in the data directory.
Parameters:
- overwrite (bool): Currently unused; the existing database directory must be deleted manually before rebuilding.
Returns:
- The newly built Chroma database.
"""
# # If overwrite flag is true, remove old databases from directory if they exist
# if overwrite:
# if os.path.exists(self.database_directory):
# shutil.rmtree(self.database_directory)
# time.sleep(1)

# PDF is the default loader defined above

if os.path.exists(self.database_directory):
raise AssertionError("Delete old database first and restart session!")

# Define text_splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)

if self.loader == "pdf":
# loader = PyPDFDirectoryLoader(self.data_path)
# get file_paths of all pdfs in data_folder
pdf_paths = glob.glob(os.path.join(self.data_path, "*.pdf"))

splits = []
for pdf_path in pdf_paths:
file_name = os.path.basename(pdf_path)
party = file_name.split("_")[0]

# Load pdf as single doc
loader = PDFMinerLoader(pdf_path, concatenate_pages=True)
doc = loader.load()

# Also load the pdf as individual pages; this is needed to recover page numbers for the splits later
loader = PDFMinerLoader(pdf_path, concatenate_pages=False)
doc_pages = loader.load()

# Add party to metadata
for i in range(len(doc)):
doc[i].metadata.update({"party": party})

# Create splits
splits_temp = text_splitter.split_documents(doc)

# For each split, find the page on which it occurs
for split in splits_temp:
for page_number, doc_page in enumerate(doc_pages):
# Create first and second half of split
split_1 = split.page_content[: int(0.5 * len(split.page_content))]
split_2 = split.page_content[int(0.5 * len(split.page_content)) :]
# A split may straddle a page break; if either half of the split appears on this page, record this page number
if split_1 in doc_page.page_content or split_2 in doc_page.page_content:
split.metadata.update({"page": page_number})
if split.metadata.get("page") is None:
split.metadata.update({"page": 1})

splits.extend(splits_temp)

elif self.loader == "csv":
loader = CSVLoader(self.data_path, metadata_columns=["date", "fullName", "politicalGroup", "party"])
# Load documents
docs = loader.load()

# Create splits
splits = text_splitter.split_documents(docs)

# Create database
self.database = Chroma.from_documents(
splits,
self.embedding_model,
persist_directory=self.database_directory,
collection_metadata={"hnsw:space": "cosine"},
)

return self.database
def __init__(
self,
embedding_model,
source_type, # "manifestos" or "debates"
data_path=".",
database_directory="./chroma",
chunk_size=1000,
chunk_overlap=200,
loader="pdf",
reload=True,
):
"""
Initializes the VectorDatabase.
Parameters:
- embedding_model: The model used to generate embeddings for the documents.
- source_type (str): The type of source documents, either "manifestos" or "debates".
- data_path (str): The directory where the source documents are located. Defaults to the current directory.
- database_directory (str): The directory in which the Chroma database is stored. Defaults to './chroma'.
- chunk_size (int): The size of the text chunks the documents are split into. Defaults to 1000.
- chunk_overlap (int): The number of characters adjacent chunks overlap. Defaults to 200.
- loader (str): "pdf" or "csv", depending on the data format.
- reload (bool): If True, an existing Chroma database is loaded from database_directory on initialization.
"""

self.embedding_model = embedding_model
self.source_type = source_type
self.data_path = data_path
self.database_directory = database_directory
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.loader = loader

if reload:
self.database = self.load_database()

def load_database(self):
"""
Loads an existing Chroma database.
Returns:
- The loaded Chroma database.
"""
if os.path.exists(self.database_directory):
self.database = Chroma(persist_directory=self.database_directory, embedding_function=self.embedding_model)
print("reloaded database")
else:
raise AssertionError(f"{self.database_directory} does not contain a database.")

return self.database

def build_database(self, overwrite=True):
"""
Builds a new Chroma database from the documents in the data directory.
Parameters:
- overwrite (bool): Currently unused; the existing database directory must be deleted manually before rebuilding.
Returns:
- The newly built Chroma database.
"""
# # If overwrite flag is true, remove old databases from directory if they exist
# if overwrite:
# if os.path.exists(self.database_directory):
# shutil.rmtree(self.database_directory)
# time.sleep(1)

# PDF is the default loader defined above

if os.path.exists(self.database_directory):
raise AssertionError("Delete old database first and restart session!")

# Define text_splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)

if self.loader == "pdf":
# loader = PyPDFDirectoryLoader(self.data_path)
# get file_paths of all pdfs in data_folder
pdf_paths = glob.glob(os.path.join(self.data_path, "*.pdf"))

splits = []
for pdf_path in pdf_paths:
file_name = os.path.basename(pdf_path)
party = file_name.split("_")[0]

# Load pdf as single doc
loader = PyMuPDFLoader(pdf_path, mode="single")
doc = loader.load()

# Also load the pdf as individual pages; this is needed to recover page numbers for the splits later
loader = PyMuPDFLoader(pdf_path, mode="page")
doc_pages = loader.load()

# Add party to metadata
for i in range(len(doc)):
doc[i].metadata.update({"party": party})

# Create splits
splits_temp = text_splitter.split_documents(doc)

# For each split, find the page on which it occurs
for split in splits_temp:
for page_number, doc_page in enumerate(doc_pages):
# Create first and second half of split
split_1 = split.page_content[: int(0.5 * len(split.page_content))]
split_2 = split.page_content[int(0.5 * len(split.page_content)) :]
# A split may straddle a page break; if either half of the split appears on this page, record this page number
if split_1 in doc_page.page_content or split_2 in doc_page.page_content:
split.metadata.update({"page": page_number})

if split.metadata.get("page") is None:
split.metadata.update({"page": 1})

splits.extend(splits_temp)

elif self.loader == "csv":
loader = CSVLoader(self.data_path, metadata_columns=["date", "fullName", "politicalGroup", "party"])
# Load documents
docs = loader.load()

# Create splits
splits = text_splitter.split_documents(docs)

# Create database
self.database = Chroma.from_documents(
splits,
self.embedding_model,
persist_directory=self.database_directory,
collection_metadata={"hnsw:space": "cosine"},
)

return self.database


if __name__ == "__main__":
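For orientation, a minimal usage sketch of the class above. The embedding model and the paths are assumptions for illustration, not taken from this commit (the app's actual configuration lives in App.py, whose diff is not rendered here):

from langchain_openai import OpenAIEmbeddings  # assumed embedding backend, not shown in this diff

from RAG.database.vector_database import VectorDatabase

embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")  # hypothetical choice

# Build a fresh Chroma store from manifesto PDFs. File names are expected to start with the
# party name followed by "_", e.g. "SPD_Wahlprogramm.pdf", because the party label is taken
# from the file-name prefix.
db = VectorDatabase(
    embedding_model=embedding_model,
    source_type="manifestos",
    data_path="data/manifestos",             # hypothetical PDF folder
    database_directory="chroma/manifestos",  # must not exist yet when building
    loader="pdf",
    reload=False,                            # skip loading; we build below
)
db.build_database()

# Later sessions reload the persisted store and query it; "party" and "page" metadata
# are attached to every split by build_database().
db = VectorDatabase(
    embedding_model=embedding_model,
    source_type="manifestos",
    database_directory="chroma/manifestos",
    reload=True,
)
for doc in db.database.similarity_search("Wie stehen die Parteien zum Mindestlohn?", k=5):
    print(doc.metadata["party"], doc.metadata.get("page"), doc.page_content[:80])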
8 changes: 2 additions & 6 deletions RAG/evaluation/evaluation.py
@@ -17,7 +17,7 @@ def context_relevancy(self, dataset):

context = ""
for i, doc in enumerate(context_docs):
-context += f"Dokument {i+1}: {doc}\n\n"
+context += f"Dokument {i + 1}: {doc}\n\n"

prompt = f"""
{instruction}
@@ -29,11 +29,7 @@
{question}"""

completion = self.client.chat.completions.create(
model="gpt-3.5-turbo",
temperature=0,
messages=[
{"role": "user", "content": prompt},
],
model="gpt-3.5-turbo", temperature=0, messages=[{"role": "user", "content": prompt}]
)
# Parse output into list
try:
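For context, the call above uses the OpenAI v1 client interface. A small sketch of the surrounding pattern; the instruction text and the expected answer format are not visible in this hunk, so the comma-separated parsing below is only an assumption:

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

prompt = "..."  # instruction + numbered "Dokument i" context + question, assembled as above
completion = client.chat.completions.create(
    model="gpt-3.5-turbo", temperature=0, messages=[{"role": "user", "content": prompt}]
)
answer = completion.choices[0].message.content

# Hypothetical parse: assumes the model was asked to return comma-separated document numbers.
try:
    relevant_ids = [int(x.strip()) for x in answer.split(",")]
except ValueError:
    relevant_ids = []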
