Integration of structured query and database filtering
Taniya-Das committed Jul 25, 2024
1 parent f2b032d commit 44c30c9
Showing 12 changed files with 1,924 additions and 244 deletions.
5 changes: 3 additions & 2 deletions backend/config.json
@@ -17,6 +17,7 @@
"search_type" : "similarity",
"reranking" : false,
"long_context_reorder" : false,
"structure_query": false,
"use_chroma_for_saving_metadata": false
"structured_query": false,
"use_chroma_for_saving_metadata": false,
"chroma_metadata_dir": "../data/chroma_db_metadata"
}
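
For orientation, a minimal sketch of how a consumer of this config might read the renamed flag and the new Chroma directory (the file path and loader here are assumptions; only the key names come from the diff above):

import json

# Load the backend configuration (path assumed to match the repo layout).
with open("../backend/config.json", "r") as file:
    config = json.load(file)

# "structured_query" replaces the old "structure_query" key; "chroma_metadata_dir"
# points at the persistent metadata store introduced by this commit.
if config.get("structured_query", False):
    chroma_dir = config["chroma_metadata_dir"]  # "../data/chroma_db_metadata" in the diff
    print(f"Structured-query filtering enabled; Chroma metadata at {chroma_dir}")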
2 changes: 1 addition & 1 deletion data/attribute_info.json
@@ -1 +1 @@
[{"name": "Unnamed: 0", "description": "Unique identifier", "type": "Numeric"}, {"name": "did", "description": "Dataset ID", "type": "Numeric"}, {"name": "name", "description": "Name of the dataset", "type": "String"}, {"name": "version", "description": "Version of this dataset. '1' for original version. Auto-incremented by server.", "type": "Numeric"}, {"name": "uploader", "description": "ID of the uploader", "type": "Numeric"}, {"name": "status", "description": "Current status of the dataset. Whether the dataset is active.", "type": "String"}, {"name": "format", "description": "Format of the dataset. Example - arff format ", "type": "String"}, {"name": "MajorityClassSize", "description": "Number of instances belonging to the most frequent class.", "type": "Numeric"}, {"name": "MaxNominalAttDistinctValues", "description": "The maximum number of distinct values among attributes of the nominal type.", "type": "Numeric"}, {"name": "MinorityClassSize", "description": "Number of instances belonging to the least frequent class.", "type": "Numeric"}, {"name": "NumberOfClasses", "description": "Number of classes in the dataset. 2.0 for binary classification, and more than 2.0 for multi-class classification.", "type": "Numeric"}, {"name": "NumberOfFeatures", "description": "Number of features or attributes in the dataset.", "type": "Numeric"}, {"name": "NumberOfInstances", "description": "Number of instances in the dataset", "type": "Numeric"}, {"name": "NumberOfInstancesWithMissingValues", "description": "Number of instances with missing values in the dataset", "type": "Numeric"}, {"name": "NumberOfMissingValues", "description": "Number of missing values in the dataset", "type": "Numeric"}, {"name": "NumberOfNumericFeatures", "description": "Number of numeric features in the dataset", "type": "Numeric"}, {"name": "NumberOfSymbolicFeatures", "description": "Number of symbolic features in the dataset", "type": "Numeric"}, {"name": "description", "description": "Description of the dataset", "type": "String"}, {"name": "qualities", "description": "Qualities of the dataset", "type": "String"}, {"name": "features", "description": "Features of the dataset", "type": "String"}, {"name": "Combined_information", "description": "Combine information from all the coulmns in the dataset.", "type": "String"}]
[{"name": "Unnamed: 0", "description": "Unique identifier", "type": "Numeric"}, {"name": "did", "description": "Dataset ID", "type": "Numeric"}, {"name": "name", "description": "Name of the dataset", "type": "String"}, {"name": "version", "description": "Version of this dataset. '1' for original version. Auto-incremented by server.", "type": "Numeric"}, {"name": "uploader", "description": "ID of the uploader", "type": "Numeric"}, {"name": "status", "description": "Current status of the dataset. Whether the dataset is active.", "type": "String"}, {"name": "format", "description": "Format of the dataset. Example - arff format ", "type": "String"}, {"name": "MajorityClassSize", "description": "Number of instances belonging to the most frequent class.", "type": "Numeric"}, {"name": "MaxNominalAttDistinctValues", "description": "The maximum number of distinct values among attributes of the nominal type.", "type": "Numeric"}, {"name": "MinorityClassSize", "description": "Number of instances belonging to the least frequent class.", "type": "Numeric"}, {"name": "NumberOfClasses", "description": "Number of classes in the dataset. 2.0 for binary classification, and more than 2.0 for multi-class classification.", "type": "Float"}, {"name": "NumberOfFeatures", "description": "Number of features or attributes in the dataset.", "type": "Numeric"}, {"name": "NumberOfInstances", "description": "Number of instances in the dataset", "type": "Numeric"}, {"name": "NumberOfInstancesWithMissingValues", "description": "Number of instances with missing values in the dataset", "type": "Numeric"}, {"name": "NumberOfMissingValues", "description": "Number of missing values in the dataset", "type": "Numeric"}, {"name": "NumberOfNumericFeatures", "description": "Number of numeric features in the dataset", "type": "Numeric"}, {"name": "NumberOfSymbolicFeatures", "description": "Number of symbolic features in the dataset", "type": "Numeric"}, {"name": "description", "description": "Description of the dataset", "type": "String"}, {"name": "qualities", "description": "Qualities of the dataset", "type": "String"}, {"name": "features", "description": "Features of the dataset", "type": "String"}, {"name": "Combined_information", "description": "Combine information from all the coulmns in the dataset.", "type": "String"}]
19 changes: 14 additions & 5 deletions frontend/ui.py
@@ -9,6 +9,13 @@
with open("../backend/config.json", "r") as file:
config = json.load(file)

# Load metadata chroma database
if config['structure_query']:
import sys
sys.path.append('../')
from structured_query.chroma_store_utilis import *
collec = load_chroma_metadata()

# Metadata paths
data_metadata_path = Path(config["data_dir"]) / "all_dataset_description.csv"
flow_metadata_path = Path(config["data_dir"]) / "all_flow_description.csv"
@@ -26,28 +26,30 @@
st.session_state["query"] = query
st.session_state["query_type"] = query_type


# Submit button logic
if st.button("Submit"):
response_parser = ResponseParser(query_type, apply_llm_before_rag=True)
response_parser = ResponseParser(query_type, apply_llm_before_rag=False)
if query_type == "Dataset":
with st.spinner("Waiting for results..."):
if config["structure_query"] == True:
# get structured query
structured_query = response_parser.fetch_structured_query(
response_parser.fetch_structured_query(
query_type, query
)
st.write(structured_query)
st.write(response_parser.structured_query_response[0])
# get rag response
response_parser.fetch_rag_response(
query_type, structured_query["query"]
query_type, response_parser.structured_query_response[0]["query"]
)
if response_parser.structured_query_response[1].get("filter"):
response_parser.database_filter(response_parser.structured_query_response[1]["filter"], collec)
else:
# get rag response
response_parser.fetch_rag_response(query_type, query)
# get llm response
response_parser.fetch_llm_response(query)
# get updated columns based on llm response

results = response_parser.parse_and_update_response(data_metadata)
# display results in a table
display_results(results)
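Read together, the new Dataset branch does three things: parse the user query into a free-text part plus an optional filter, run RAG retrieval on the free-text part, then narrow the RAG hits against the metadata database when a filter exists. A condensed sketch of that flow, reusing the names from the diff above (spinner, the non-structured fallback, and error handling omitted):

structured = response_parser.fetch_structured_query(query_type, query)    # (parsed query dict, {"filter": ...})
response_parser.fetch_rag_response(query_type, structured[0]["query"])    # semantic retrieval on the rewritten query
if structured[1].get("filter"):                                           # filter only if the LLM produced one
    response_parser.database_filter(structured[1]["filter"], collec)      # keep RAG ids that satisfy the filter
results = response_parser.parse_and_update_response(data_metadata)
display_results(results)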
34 changes: 30 additions & 4 deletions frontend/ui_utils.py
@@ -4,6 +4,7 @@
import requests
import streamlit as st
from streamlit import session_state as ss
from langchain_community.query_constructors.chroma import ChromaTranslator


def feedback_cb():
@@ -108,6 +109,8 @@ def __init__(self, query_type, apply_llm_before_rag=False):
self.rag_response = None
self.llm_response = None
self.apply_llm_before_rag = apply_llm_before_rag
self.database_filtered = None
self.structured_query_response = None

def load_paths(self):
"""
@@ -148,9 +151,19 @@ def fetch_structured_query(self, query_type, query):
f"{structured_response_path['local']}{query}",
json={"query": query},
).json()
print(self.structured_query_response)

return self.structured_query_response


def database_filter(self, filter_condition, collec):
"""
Apply database filter on the rag_response
"""
ids = list(map(str, self.rag_response['initial_response']))
self.database_filtered = collec.get(ids = ids, where=filter_condition)['ids']
self.database_filtered = list(map(int, self.database_filtered))
# print(self.database_filtered)
return self.database_filtered

def fetch_rag_response(self, query_type, query):
"""
Description: Fetch the response from the FastAPI service
@@ -198,7 +211,20 @@ def parse_and_update_response(self, metadata):
return metadata
elif (
self.rag_response is not None and self.structured_query_response is not None
):
return metadata[["did", "name"]]
):
col_name = ["status", "NumberOfClasses", "NumberOfFeatures", "NumberOfInstances"]
if self.structured_query_response[0].get("filter"):
filtered_metadata = metadata[
metadata["did"].isin(self.database_filtered)
]
print("Showing database filtered data")
else:
filtered_metadata = metadata[
metadata["did"].isin(self.rag_response['initial_response'])
]
print("Showing only rag response")
return filtered_metadata[["did", "name", *col_name]]
else:
return metadata
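
database_filter keeps only those RAG hits whose stored metadata satisfies the translated filter, via ChromaDB's "where" clause. A minimal sketch with made-up ids and a hand-written condition, assuming collec is the collection returned by load_chroma_metadata() (in the app the condition comes from ChromaTranslator):

rag_ids = ["40983", "1464", "61"]                      # rag_response["initial_response"] ids, cast to strings
filter_condition = {"NumberOfClasses": {"$gte": 2.0}}  # assumed example of a Chroma "where" clause
kept = collec.get(ids=rag_ids, where=filter_condition)["ids"]
kept = [int(i) for i in kept]                          # back to integer "did" values for the pandas lookup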


5 changes: 4 additions & 1 deletion start_local.sh
@@ -10,7 +10,10 @@ cd ollama || exit
./get_ollama.sh &
echo $! > $PID_FILE

structured_query = false
# Fetch configuration from ../backend/config.json
config_file="../backend/config.json"
structured_query=$(jq -r '.structured_query' $config_file)

if [ "$structured_query" == true ]; then
cd ../structured_query || exit
uvicorn llm_service_structured_query:app --host 0.0.0.0 --port 8082 &
19 changes: 19 additions & 0 deletions start_structured_query_service.sh
@@ -0,0 +1,19 @@
#!/bin/bash
poetry install
killall ollama
killall streamlit
# Define a file to store the PIDs
PID_FILE="processes.pid"

# Start processes and save their PIDs
cd ollama
./get_ollama.sh &
echo $! > $PID_FILE

cd ../structured_query
uvicorn llm_service_structured_query:app --host 0.0.0.0 --port 8082 &
echo $! > $PID_FILE

cd ..
# Keep the script running to maintain the background processes
wait
Empty file added structured_query/__init__.py
44 changes: 44 additions & 0 deletions structured_query/chroma_store_utilis.py
@@ -0,0 +1,44 @@
import sqlalchemy
import pandas as pd
import chromadb
from langchain_community.vectorstores.chroma import Chroma
from tqdm.auto import tqdm

import sys

sys.path.append("../")
sys.path.append("../backend/")
from backend.modules.utils import load_config_and_device

config = load_config_and_device("../backend/config.json")

# load the persistent database using ChromaDB
client = chromadb.PersistentClient(path=config["chroma_metadata_dir"])

collec = client.get_or_create_collection(name = "metadata")

metadata_df = pd.read_csv("../data/all_dataset_description.csv")
metadata_df = metadata_df.drop(columns=["Combined_information"])

# Function to chunk the DataFrame
def chunk_dataframe(df, chunk_size):
for i in range(0, df.shape[0], chunk_size):
yield df.iloc[i : i + chunk_size]

def load_chroma_metadata():
# Define the chunk size
chunk_size = 100 # Adjust the chunk size as needed

# Process each chunk
for chunk in tqdm(
chunk_dataframe(metadata_df, chunk_size), total=(len(metadata_df) // chunk_size) + 1
):
ids = chunk["did"].astype(str).tolist()
documents = chunk["description"].astype(str).tolist()
metadatas = chunk.to_dict(orient="records")

# Add to ChromaDB collection
if collec.get(ids=ids) == []:
collec.add(ids=ids, documents=documents, metadatas=metadatas)

return collec
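
Illustrative use of the helper above once the collection is populated (the ids and the filter value are assumptions):

collec = load_chroma_metadata()           # returns the persistent "metadata" collection
hits = collec.get(
    ids=["2", "31"],                      # hypothetical dataset ids; Chroma stores ids as strings
    where={"status": "active"},           # filter over the per-dataset metadata columns
)
print(hits["ids"], hits["metadatas"])

Chunking the DataFrame into batches of 100 keeps each collec.add call small; the size is a tunable default rather than a Chroma requirement.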
15 changes: 9 additions & 6 deletions structured_query/llm_service_structured_query.py
@@ -8,7 +8,7 @@
sys.path.append("../")
sys.path.append("../data")

with open("attribute_info.json", "r") as f:
with open("../data/attribute_info.json", "r") as f:
attribute_info = json.loads(f.read())

attribute_info = attribute_info[1:]
@@ -52,19 +52,22 @@
from httpx import ConnectTimeout
# from llm_service_utils import create_chain, parse_answers_initial
from tenacity import retry, retry_if_exception_type, stop_after_attempt
from langchain_community.query_constructors.chroma import ChromaTranslator

app = FastAPI()


print("[INFO] Starting structured query service.")
# Create port
@app.get("/structuredquery/{query}", response_class=JSONResponse)
@retry(stop=stop_after_attempt(3), retry=retry_if_exception_type(ConnectTimeout))
@retry(stop=stop_after_attempt(1), retry=retry_if_exception_type(ConnectTimeout))
async def get_structured_query(query: str):
"""
Description: Get the query, replace %20 with space and invoke the chain to get the answers based on the prompt
Description: Get the query, replace %20 with space and invoke the chain to get the answers based on the prompt.
"""
query = query.replace("%20", " ")
response = chain.invoke({"query": query})
return response
obj = ChromaTranslator()
filter_condition = obj.visit_structured_query(structured_query=response)[1]

return response, filter_condition
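
For reference, a small standalone sketch of the pair this endpoint now returns: the query constructor's StructuredQuery, translated by ChromaTranslator into a Chroma-style filter. The IR import path assumes a recent LangChain split into langchain_core, and the example values are invented:

from langchain_community.query_constructors.chroma import ChromaTranslator
from langchain_core.structured_query import Comparator, Comparison, StructuredQuery

structured = StructuredQuery(
    query="mushroom dataset",                           # free-text part, reused for RAG retrieval
    filter=Comparison(comparator=Comparator.GT, attribute="NumberOfClasses", value=2),
    limit=None,
)
new_query, kwargs = ChromaTranslator().visit_structured_query(structured_query=structured)
# new_query -> "mushroom dataset"
# kwargs    -> {"filter": {"NumberOfClasses": {"$gt": 2}}}, i.e. the second element of the response above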

