Skip to content

Commit

Permalink
Integrate structured query (search part only) with the existing app
Browse files Browse the repository at this point in the history
  • Loading branch information
Taniya-Das committed Jul 19, 2024
1 parent 0837b7e commit 9cf2b3d
Show file tree
Hide file tree
Showing 10 changed files with 218 additions and 9 deletions.
3 changes: 2 additions & 1 deletion backend/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"top_p" : 0.95,
"search_type" : "similarity",
"reranking" : false,
"long_context_reorder" : false
"long_context_reorder" : false,
"structure_query": true

}
4 changes: 4 additions & 0 deletions frontend/paths.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,9 @@
"llm_response" : {
"docker" : "http://fastapi:8081/llmquery/",
"local" : "http://0.0.0.0:8081/llmquery/"
},
"structured_query": {
"docker" : "http://fastapi:8082/structuredquery/",
"local" : "http://0.0.0.0:8082/structuredquery/"
}
}
20 changes: 15 additions & 5 deletions frontend/ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,21 @@
response_parser = ResponseParser(query_type, apply_llm_before_rag=True)
if query_type == "Dataset":
    with st.spinner("Waiting for results..."):
        # Truthiness check instead of `== True` (config value is a bool).
        if config["structure_query"]:
            # Structured-query path: rewrite the free-text query into a
            # {query, filter} pair via the structured-query service, then
            # run RAG on the rewritten query text only.
            structured_query = response_parser.fetch_structured_query(query_type, query)
            st.write(structured_query)
            response_parser.fetch_rag_response(query_type, structured_query["query"])
            # LLM post-processing is skipped on this path for now.
            # response_parser.fetch_llm_response(query)
        else:
            # Plain path: RAG on the raw query, then LLM post-processing
            # to derive the updated result columns.
            response_parser.fetch_rag_response(query_type, query)
            response_parser.fetch_llm_response(query)
        results = response_parser.parse_and_update_response(data_metadata)
        # display results in a table
        display_results(results)
Expand Down
22 changes: 22 additions & 0 deletions frontend/ui_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,28 @@ def fetch_llm_response(self, query):
f"{llm_response_path['local']}{query}"
).json()
return self.llm_response

def fetch_structured_query(self, query_type, query):
    """
    Fetch the structured form of the user query from the FastAPI
    structured-query service.

    Tries the Docker-internal URL first and falls back to the local URL
    when the service is unreachable there. The raw JSON response is
    stored on ``self.structured_query_response`` and returned.

    :param query_type: kept for signature parity with the other fetch_*
        methods (currently unused by the endpoint).
    :param query: free-text user query to be structured.
    """
    structured_response_path = self.paths["structured_query"]
    # NOTE(review): the query is sent both in the URL path and as a JSON
    # body; the endpoint only reads the path parameter.
    try:
        self.structured_query_response = requests.get(
            f"{structured_response_path['docker']}{query}",
            json={"query": query},
        ).json()
    except requests.exceptions.RequestException:
        # Narrowed from a bare `except:` — only network/HTTP failures
        # should trigger the fallback to the locally-bound service.
        self.structured_query_response = requests.get(
            f"{structured_response_path['local']}{query}",
            json={"query": query},
        ).json()
    print(self.structured_query_response)
    return self.structured_query_response



def fetch_rag_response(self, query_type, query):
"""
Expand Down
15 changes: 12 additions & 3 deletions start_local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,18 @@ cd ollama
./get_ollama.sh &
echo $! > $PID_FILE

cd ../llm_service
uvicorn llm_service:app --host 0.0.0.0 --port 8081 &
echo $! > $PID_FILE
# Toggle: launch the structured-query LLM service instead of the plain one.
# (Was `structured_query = True` / `if [$structured_query ==true ]` — invalid
# bash: assignments take no spaces, and `[` requires spaces around operands.)
structured_query=true
if [ "$structured_query" = "true" ]; then
    cd ../structured_query
    uvicorn llm_service_structured_query:app --host 0.0.0.0 --port 8082 &
    echo $! > $PID_FILE
else
    cd ../llm_service
    uvicorn llm_service:app --host 0.0.0.0 --port 8081 &
    echo $! > $PID_FILE
fi


cd ../backend
uvicorn backend:app --host 0.0.0.0 --port 8000 &
Expand Down
3 changes: 3 additions & 0 deletions stop_local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ kill -9 $(cat $PID_FILE)
# Stop the plain LLM service.
# NOTE(review): every service writes its PID to $PID_FILE relative to its own
# directory — confirm PID_FILE is a relative path, otherwise each start
# overwrites the previous PID and only the last service gets killed.
cd ../llm_service
kill -9 $(cat $PID_FILE)

# Stop the structured-query service.
cd ../structured_query
kill -9 $(cat $PID_FILE)

# Stop the Streamlit frontend.
cd ../frontend
# streamlit run ui.py &
kill -9 $(cat $PID_FILE)
Expand Down
72 changes: 72 additions & 0 deletions structured_query/llm_service_structured_query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# LangChain helpers for building a self-query ("structured query") constructor.
from langchain.chains.query_constructor.base import (
    get_query_constructor_prompt,
    load_query_constructor_runnable,
)
from structured_query_examples import examples

import json
import sys

# Make sibling directories importable when the service is started from here.
sys.path.append("../")
sys.path.append("../data")

# Attribute metadata describing the filterable dataset fields.
# NOTE(review): reads attribute_info.json from the working directory, while
# structuring_query.py loads the same file from ../data — confirm which path
# is correct for the deployed service.
with open("attribute_info.json", "r") as f:
    attribute_info = json.loads(f.read())

# Drop the first entry — presumably a header/combined record; TODO confirm.
attribute_info = attribute_info[1:]

examples = examples  # no-op re-binding of the imported examples; kept as-is
document_content_description = "Metadata of datasets for various machine learning applications fetched from OpenML platform"
# Built for inspection/debugging; the chain below constructs its own prompt.
prompt = get_query_constructor_prompt(
    document_contents=document_content_description,
    attribute_info=attribute_info,
    examples=examples,
)

from langchain_community.chat_models import ChatOllama

# Subset of attributes the query constructor is allowed to filter on.
content_attr = [
    "status",
    "NumberOfClasses",
    "NumberOfFeatures",
    "NumberOfInstances",
    "Combined_information",
]
# document_content_description = "Metadata of machine learning datasets including status (if dataset is active or not), number of classes in the dataset, number of instances (examples) in the dataset, number of features in the dataset, and Combined_information containing the combined metadata information about the dataset."
filter_attribute_info = tuple(ai for ai in attribute_info if ai["name"] in content_attr)

# Runnable that converts a natural-language query into a structured query
# (query text + filter) using a local llama3 model served via Ollama.
# fix_invalid=True asks LangChain to repair malformed filter output.
chain = load_query_constructor_runnable(
    ChatOllama(model="llama3"),
    document_content_description,
    # attribute_info,
    filter_attribute_info,
    examples=examples,
    fix_invalid=True,
)

from fastapi import FastAPI
from fastapi.responses import JSONResponse
from httpx import ConnectTimeout
from tenacity import retry, retry_if_exception_type, stop_after_attempt

app = FastAPI()


@app.get("/structuredquery/{query}", response_class=JSONResponse)
@retry(stop=stop_after_attempt(3), retry=retry_if_exception_type(ConnectTimeout))
async def get_structured_query(query: str):
    """
    Convert a free-text dataset query into a structured {query, filter} response.

    Replaces any literal "%20" sequences with spaces (defensive only —
    FastAPI URL-decodes path parameters before they reach the handler)
    and invokes the query-constructor chain. Retries up to 3 times when
    the LLM backend connection times out.
    """
    query = query.replace("%20", " ")
    response = chain.invoke({"query": query})
    return response

Empty file.
41 changes: 41 additions & 0 deletions structured_query/structured_query_examples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@

examples = [
(
"Give me mushroom datasets with less than 10k rows.",
{
"query": "mushroom dataset",
"filter": 'lt("NumberOfInstances", 10000)',
},
),
(
"Give me datasets than can be used in healthcare with 2 or more classes",
{
"query": "heathcare dataset ",
"filter": 'gte("NumberOfClasses", 2.0)',
},
),
(
"Give me datasets than can be used in healthcare, or climate applications with 2 or more classes and in arff format.",
{
"query": "heathcare dataset, climate datasets",
"filter": 'and(gte("NumberOfClasses", 2.0), eq("format", "arff"))',
},
),
(
"Give me medical datasets.",
{
"query": "medical datasets",
"filter": "NO_FILTER",
},
),
(
"Give me medical datasets with large number of features.",
{
"query": "medical datasets",
"filter": 'gte("NumberOfFeatures, 100)',
},
),
]



47 changes: 47 additions & 0 deletions structured_query/structuring_query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# LangChain helpers for building a self-query ("structured query") constructor.
from langchain.chains.query_constructor.base import (
    get_query_constructor_prompt,
    load_query_constructor_runnable,
)
from structured_query.structured_query_examples import examples

import json

# Attribute metadata describing the filterable dataset fields.
# NOTE(review): loads from ../data here, while llm_service_structured_query.py
# reads attribute_info.json from the working directory — confirm which is right.
with open("../data/attribute_info.json", "r") as f:
    attribute_info = json.loads(f.read())

# Drop the first entry — presumably a header/combined record; TODO confirm.
attribute_info = attribute_info[1:]

examples = examples  # no-op re-binding of the imported examples; kept as-is
document_content_description = "Metadata of datasets for various machine learning applications fetched from OpenML platform"
# Built for inspection/debugging; the chain below constructs its own prompt.
prompt = get_query_constructor_prompt(
    document_contents=document_content_description,
    attribute_info=attribute_info,
    examples=examples,
)

from langchain_community.chat_models import ChatOllama

# Subset of attributes the query constructor is allowed to filter on.
content_attr = [
    "status",
    "NumberOfClasses",
    "NumberOfFeatures",
    "NumberOfInstances",
    "Combined_information",
]
# document_content_description = "Metadata of machine learning datasets including status (if dataset is active or not), number of classes in the dataset, number of instances (examples) in the dataset, number of features in the dataset, and Combined_information containing the combined metadata information about the dataset."
filter_attribute_info = tuple(ai for ai in attribute_info if ai["name"] in content_attr)

# Runnable that converts a natural-language query into a structured query
# (query text + filter) using a local llama3 model served via Ollama.
# fix_invalid=True asks LangChain to repair malformed filter output.
chain = load_query_constructor_runnable(
    ChatOllama(model="llama3"),
    document_content_description,
    # attribute_info,
    filter_attribute_info,
    examples=examples,
    fix_invalid=True,
)

def structuring_query(query: str):
    """Run *query* through the query-constructor chain.

    Returns the (query, filter) pair extracted from the resulting
    structured-query object.
    """
    result = chain.invoke(query)
    return result.query, result.filter

0 comments on commit 9cf2b3d

Please sign in to comment.