Integration of structured query and database filtering
Taniya-Das committed Jul 25, 2024
1 parent f2b032d commit 44c30c9
Showing 12 changed files with 1,924 additions and 244 deletions.
5 changes: 3 additions & 2 deletions backend/config.json
@@ -17,6 +17,7 @@
"search_type" : "similarity",
"reranking" : false,
"long_context_reorder" : false,
"structure_query": false,
"use_chroma_for_saving_metadata": false
"structured_query": false,
"use_chroma_for_saving_metadata": false,
"chroma_metadata_dir": "../data/chroma_db_metadata"
}
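
For orientation, a minimal sketch of how a consumer of this config might read the renamed flag and the new Chroma directory (the file path and loader here are assumptions; only the key names come from the diff above):

import json

# Load the backend configuration (path assumed to match the repo layout).
with open("../backend/config.json", "r") as file:
    config = json.load(file)

# "structured_query" replaces the old "structure_query" key; "chroma_metadata_dir"
# points at the persistent metadata store introduced by this commit.
if config.get("structured_query", False):
    chroma_dir = config["chroma_metadata_dir"]  # "../data/chroma_db_metadata" in the diff
    print(f"Structured-query filtering enabled; Chroma metadata at {chroma_dir}")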
2 changes: 1 addition & 1 deletion data/attribute_info.json
@@ -1 +1 @@
[{"name": "Unnamed: 0", "description": "Unique identifier", "type": "Numeric"}, {"name": "did", "description": "Dataset ID", "type": "Numeric"}, {"name": "name", "description": "Name of the dataset", "type": "String"}, {"name": "version", "description": "Version of this dataset. '1' for original version. Auto-incremented by server.", "type": "Numeric"}, {"name": "uploader", "description": "ID of the uploader", "type": "Numeric"}, {"name": "status", "description": "Current status of the dataset. Whether the dataset is active.", "type": "String"}, {"name": "format", "description": "Format of the dataset. Example - arff format ", "type": "String"}, {"name": "MajorityClassSize", "description": "Number of instances belonging to the most frequent class.", "type": "Numeric"}, {"name": "MaxNominalAttDistinctValues", "description": "The maximum number of distinct values among attributes of the nominal type.", "type": "Numeric"}, {"name": "MinorityClassSize", "description": "Number of instances belonging to the least frequent class.", "type": "Numeric"}, {"name": "NumberOfClasses", "description": "Number of classes in the dataset. 2.0 for binary classification, and more than 2.0 for multi-class classification.", "type": "Numeric"}, {"name": "NumberOfFeatures", "description": "Number of features or attributes in the dataset.", "type": "Numeric"}, {"name": "NumberOfInstances", "description": "Number of instances in the dataset", "type": "Numeric"}, {"name": "NumberOfInstancesWithMissingValues", "description": "Number of instances with missing values in the dataset", "type": "Numeric"}, {"name": "NumberOfMissingValues", "description": "Number of missing values in the dataset", "type": "Numeric"}, {"name": "NumberOfNumericFeatures", "description": "Number of numeric features in the dataset", "type": "Numeric"}, {"name": "NumberOfSymbolicFeatures", "description": "Number of symbolic features in the dataset", "type": "Numeric"}, {"name": "description", "description": "Description of the dataset", "type": "String"}, {"name": "qualities", "description": "Qualities of the dataset", "type": "String"}, {"name": "features", "description": "Features of the dataset", "type": "String"}, {"name": "Combined_information", "description": "Combine information from all the coulmns in the dataset.", "type": "String"}]
[{"name": "Unnamed: 0", "description": "Unique identifier", "type": "Numeric"}, {"name": "did", "description": "Dataset ID", "type": "Numeric"}, {"name": "name", "description": "Name of the dataset", "type": "String"}, {"name": "version", "description": "Version of this dataset. '1' for original version. Auto-incremented by server.", "type": "Numeric"}, {"name": "uploader", "description": "ID of the uploader", "type": "Numeric"}, {"name": "status", "description": "Current status of the dataset. Whether the dataset is active.", "type": "String"}, {"name": "format", "description": "Format of the dataset. Example - arff format ", "type": "String"}, {"name": "MajorityClassSize", "description": "Number of instances belonging to the most frequent class.", "type": "Numeric"}, {"name": "MaxNominalAttDistinctValues", "description": "The maximum number of distinct values among attributes of the nominal type.", "type": "Numeric"}, {"name": "MinorityClassSize", "description": "Number of instances belonging to the least frequent class.", "type": "Numeric"}, {"name": "NumberOfClasses", "description": "Number of classes in the dataset. 2.0 for binary classification, and more than 2.0 for multi-class classification.", "type": "Float"}, {"name": "NumberOfFeatures", "description": "Number of features or attributes in the dataset.", "type": "Numeric"}, {"name": "NumberOfInstances", "description": "Number of instances in the dataset", "type": "Numeric"}, {"name": "NumberOfInstancesWithMissingValues", "description": "Number of instances with missing values in the dataset", "type": "Numeric"}, {"name": "NumberOfMissingValues", "description": "Number of missing values in the dataset", "type": "Numeric"}, {"name": "NumberOfNumericFeatures", "description": "Number of numeric features in the dataset", "type": "Numeric"}, {"name": "NumberOfSymbolicFeatures", "description": "Number of symbolic features in the dataset", "type": "Numeric"}, {"name": "description", "description": "Description of the dataset", "type": "String"}, {"name": "qualities", "description": "Qualities of the dataset", "type": "String"}, {"name": "features", "description": "Features of the dataset", "type": "String"}, {"name": "Combined_information", "description": "Combine information from all the coulmns in the dataset.", "type": "String"}]
19 changes: 14 additions & 5 deletions frontend/ui.py
@@ -9,6 +9,13 @@
with open("../backend/config.json", "r") as file:
config = json.load(file)

# Load metadata chroma database
if config['structure_query']:
import sys
sys.path.append('../')
from structured_query.chroma_store_utilis import *
collec = load_chroma_metadata()

# Metadata paths
data_metadata_path = Path(config["data_dir"]) / "all_dataset_description.csv"
flow_metadata_path = Path(config["data_dir"]) / "all_flow_description.csv"
@@ -26,28 +26,30 @@
st.session_state["query"] = query
st.session_state["query_type"] = query_type


# Submit button logic
if st.button("Submit"):
response_parser = ResponseParser(query_type, apply_llm_before_rag=True)
response_parser = ResponseParser(query_type, apply_llm_before_rag=False)
if query_type == "Dataset":
with st.spinner("Waiting for results..."):
if config["structure_query"] == True:
# get structured query
structured_query = response_parser.fetch_structured_query(
response_parser.fetch_structured_query(
query_type, query
)
st.write(structured_query)
st.write(response_parser.structured_query_response[0])
# get rag response
response_parser.fetch_rag_response(
query_type, structured_query["query"]
query_type, response_parser.structured_query_response[0]["query"]
)
if response_parser.structured_query_response[1].get("filter"):
response_parser.database_filter(response_parser.structured_query_response[1]["filter"], collec)
else:
# get rag response
response_parser.fetch_rag_response(query_type, query)
# get llm response
response_parser.fetch_llm_response(query)
# get updated columns based on llm response

results = response_parser.parse_and_update_response(data_metadata)
# display results in a table
display_results(results)
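Read together, the new Dataset branch does three things: parse the user query into a free-text part plus an optional filter, run RAG retrieval on the free-text part, then narrow the RAG hits against the metadata database when a filter exists. A condensed sketch of that flow, reusing the names from the diff above (spinner, the non-structured fallback, and error handling omitted):

structured = response_parser.fetch_structured_query(query_type, query)    # (parsed query dict, {"filter": ...})
response_parser.fetch_rag_response(query_type, structured[0]["query"])    # semantic retrieval on the rewritten query
if structured[1].get("filter"):                                           # filter only if the LLM produced one
    response_parser.database_filter(structured[1]["filter"], collec)      # keep RAG ids that satisfy the filter
results = response_parser.parse_and_update_response(data_metadata)
display_results(results)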
34 changes: 30 additions & 4 deletions frontend/ui_utils.py
@@ -4,6 +4,7 @@
import requests
import streamlit as st
from streamlit import session_state as ss
from langchain_community.query_constructors.chroma import ChromaTranslator


def feedback_cb():
@@ -108,6 +109,8 @@ def __init__(self, query_type, apply_llm_before_rag=False):
self.rag_response = None
self.llm_response = None
self.apply_llm_before_rag = apply_llm_before_rag
self.database_filtered = None
self.structured_query_response = None

def load_paths(self):
"""
@@ -148,9 +151,19 @@ def fetch_structured_query(self, query_type, query):
f"{structured_response_path['local']}{query}",
json={"query": query},
).json()
print(self.structured_query_response)

return self.structured_query_response


def database_filter(self, filter_condition, collec):
"""
Apply database filter on the rag_response
"""
ids = list(map(str, self.rag_response['initial_response']))
self.database_filtered = collec.get(ids = ids, where=filter_condition)['ids']
self.database_filtered = list(map(int, self.database_filtered))
# print(self.database_filtered)
return self.database_filtered

def fetch_rag_response(self, query_type, query):
"""
Description: Fetch the response from the FastAPI service
@@ -198,7 +211,20 @@ def parse_and_update_response(self, metadata):
return metadata
elif (
self.rag_response is not None and self.structured_query_response is not None
):
return metadata[["did", "name"]]
):
col_name = ["status", "NumberOfClasses", "NumberOfFeatures", "NumberOfInstances"]
if self.structured_query_response[0].get("filter"):
filtered_metadata = metadata[
metadata["did"].isin(self.database_filtered)
]
print("Showing database filtered data")
else:
filtered_metadata = metadata[
metadata["did"].isin(self.rag_response['initial_response'])
]
print("Showing only rag response")
return filtered_metadata[["did", "name", *col_name]]
else:
return metadata
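
database_filter keeps only those RAG hits whose stored metadata satisfies the translated filter, via ChromaDB's "where" clause. A minimal sketch with made-up ids and a hand-written condition, assuming collec is the collection returned by load_chroma_metadata() (in the app the condition comes from ChromaTranslator):

rag_ids = ["40983", "1464", "61"]                      # rag_response["initial_response"] ids, cast to strings
filter_condition = {"NumberOfClasses": {"$gte": 2.0}}  # assumed example of a Chroma "where" clause
kept = collec.get(ids=rag_ids, where=filter_condition)["ids"]
kept = [int(i) for i in kept]                          # back to integer "did" values for the pandas lookup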


5 changes: 4 additions & 1 deletion start_local.sh
@@ -10,7 +10,10 @@ cd ollama || exit
./get_ollama.sh &
echo $! > $PID_FILE

structured_query = false
# Fetch configuration from ../backend/config.json
config_file="../backend/config.json"
structured_query=$(jq -r '.structured_query' $config_file)

if [ "$structured_query" == true ]; then
cd ../structured_query || exit
uvicorn llm_service_structured_query:app --host 0.0.0.0 --port 8082 &
19 changes: 19 additions & 0 deletions start_structured_query_service.sh
@@ -0,0 +1,19 @@
#!/bin/bash
poetry install
killall ollama
killall streamlit
# Define a file to store the PIDs
PID_FILE="processes.pid"

# Start processes and save their PIDs
cd ollama
./get_ollama.sh &
echo $! > $PID_FILE

cd ../structured_query
uvicorn llm_service_structured_query:app --host 0.0.0.0 --port 8082 &
echo $! > $PID_FILE

cd ..
# Keep the script running to maintain the background processes
wait
Empty file added structured_query/__init__.py
44 changes: 44 additions & 0 deletions structured_query/chroma_store_utilis.py
@@ -0,0 +1,44 @@
import sqlalchemy
import pandas as pd
import chromadb
from langchain_community.vectorstores.chroma import Chroma
from tqdm.auto import tqdm

import sys

sys.path.append("../")
sys.path.append("../backend/")
from backend.modules.utils import load_config_and_device

config = load_config_and_device("../backend/config.json")

# load the persistent database using ChromaDB
client = chromadb.PersistentClient(path=config["chroma_metadata_dir"])

collec = client.get_or_create_collection(name = "metadata")

metadata_df = pd.read_csv("../data/all_dataset_description.csv")
metadata_df = metadata_df.drop(columns=["Combined_information"])

# Function to chunk the DataFrame
def chunk_dataframe(df, chunk_size):
for i in range(0, df.shape[0], chunk_size):
yield df.iloc[i : i + chunk_size]

def load_chroma_metadata():
# Define the chunk size
chunk_size = 100 # Adjust the chunk size as needed

# Process each chunk
for chunk in tqdm(
chunk_dataframe(metadata_df, chunk_size), total=(len(metadata_df) // chunk_size) + 1
):
ids = chunk["did"].astype(str).tolist()
documents = chunk["description"].astype(str).tolist()
metadatas = chunk.to_dict(orient="records")

# Add to ChromaDB collection
if collec.get(ids=ids) == []:
collec.add(ids=ids, documents=documents, metadatas=metadatas)

return collec
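
Illustrative use of the helper above once the collection is populated (the ids and the filter value are assumptions):

collec = load_chroma_metadata()           # returns the persistent "metadata" collection
hits = collec.get(
    ids=["2", "31"],                      # hypothetical dataset ids; Chroma stores ids as strings
    where={"status": "active"},           # filter over the per-dataset metadata columns
)
print(hits["ids"], hits["metadatas"])

Chunking the DataFrame into batches of 100 keeps each collec.add call small; the size is a tunable default rather than a Chroma requirement.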
15 changes: 9 additions & 6 deletions structured_query/llm_service_structured_query.py
@@ -8,7 +8,7 @@
sys.path.append("../")
sys.path.append("../data")

with open("attribute_info.json", "r") as f:
with open("../data/attribute_info.json", "r") as f:
attribute_info = json.loads(f.read())

attribute_info = attribute_info[1:]
@@ -52,19 +52,22 @@
from httpx import ConnectTimeout
# from llm_service_utils import create_chain, parse_answers_initial
from tenacity import retry, retry_if_exception_type, stop_after_attempt
from langchain_community.query_constructors.chroma import ChromaTranslator

app = FastAPI()


print("[INFO] Starting structured query service.")
# Create port
@app.get("/structuredquery/{query}", response_class=JSONResponse)
@retry(stop=stop_after_attempt(3), retry=retry_if_exception_type(ConnectTimeout))
@retry(stop=stop_after_attempt(1), retry=retry_if_exception_type(ConnectTimeout))
async def get_structured_query(query: str):
"""
Description: Get the query, replace %20 with space and invoke the chain to get the answers based on the prompt
Description: Get the query, replace %20 with space and invoke the chain to get the answers based on the prompt.
"""
query = query.replace("%20", " ")
response = chain.invoke({"query": query})
return response
obj = ChromaTranslator()
filter_condition = obj.visit_structured_query(structured_query=response)[1]

return response, filter_condition
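
For reference, a small standalone sketch of the pair this endpoint now returns: the query constructor's StructuredQuery, translated by ChromaTranslator into a Chroma-style filter. The IR import path assumes a recent LangChain split into langchain_core, and the example values are invented:

from langchain_community.query_constructors.chroma import ChromaTranslator
from langchain_core.structured_query import Comparator, Comparison, StructuredQuery

structured = StructuredQuery(
    query="mushroom dataset",                           # free-text part, reused for RAG retrieval
    filter=Comparison(comparator=Comparator.GT, attribute="NumberOfClasses", value=2),
    limit=None,
)
new_query, kwargs = ChromaTranslator().visit_structured_query(structured_query=structured)
# new_query -> "mushroom dataset"
# kwargs    -> {"filter": {"NumberOfClasses": {"$gt": 2}}}, i.e. the second element of the response above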

