
Commit

Taniya-Das committed Aug 9, 2024
2 parents b00a944 + 1218cc9 commit 4f2a095
Showing 17 changed files with 95 additions and 47 deletions.
7 changes: 4 additions & 3 deletions backend/Dockerfile
@@ -1,12 +1,13 @@
FROM python:3.10.14
RUN mkdir /fastapi
COPY requirements.txt fastapi/requirements.txt
RUN mkdir /data
COPY ./backend /fastapi
COPY ./data /data
WORKDIR /fastapi

RUN pip install -r requirements.txt

COPY ./data /fastapi
COPY . /fastapi
#COPY . /fastapi
# COPY modules /fastapi
# COPY *.py /fastapi
# COPY *.json /fastapi
13 changes: 7 additions & 6 deletions backend/backend.py
@@ -16,11 +16,12 @@
config["test_subset"] = True
config["data_dir"] = "./data/testing_data/"
# load the persistent database using ChromaDB
print('Loading DB')
client = chromadb.PersistentClient(path=config["persist_dir"])
# Loading the metadata for all types

# Setup llm chain, initialize the retriever and llm, and setup Retrieval QA

print('Setting LLM chain')
qa_dataset_handler = QASetup(
config=config,
data_type="dataset",
@@ -41,7 +42,7 @@
llm_chain_handler = LLMChainCreator(config=config, local=True)
llm_chain_handler.enable_cache()
llm_chain = llm_chain_handler.get_llm_chain()

print('OK.')

# Send test query as first query to avoid cold start
try:
@@ -65,8 +66,8 @@ async def read_dataset(query: str):
# Fetch the result data frame based on the query
_, ids_order = QueryProcessor(
query=query,
qa=qa_dataset if type_of_query == "dataset" else qa_flow,
type_of_query=type_of_query,
qa=qa_dataset,
type_of_query='dataset',
config=config,
).get_result_from_query()

@@ -86,8 +87,8 @@ async def read_flow(query: str):
try:
_, ids_order = QueryProcessor(
query=query,
qa=qa_dataset if type_of_query == "flow" else qa_flow,
type_of_query=type_of_query,
qa=qa_flow,
type_of_query='flow',
config=config,
).get_result_from_query()

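The two endpoints above previously consulted a shared type_of_query variable that is not defined anywhere in the shown excerpt; the commit pins each route to its own fixed type instead. A minimal sketch of the resulting pattern, assuming QueryProcessor, qa_dataset, and config are the objects set up earlier in backend.py (the route path here is illustrative, not taken from the repo):

from fastapi import FastAPI

app = FastAPI()

@app.get("/dataset/{query}")
async def read_dataset(query: str):
    # The query type is now pinned per route rather than read from a
    # shared (and previously unresolved) type_of_query variable.
    _, ids_order = QueryProcessor(
        query=query,
        qa=qa_dataset,
        type_of_query="dataset",
        config=config,
    ).get_result_from_query()
    return ids_order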
4 changes: 2 additions & 2 deletions backend/training.py
@@ -1,6 +1,6 @@
import chromadb
from .modules.rag_llm import *
from .modules.utils import *
from modules.rag_llm import *
from modules.utils import *

# Load the config file and set training to true
config = load_config_and_device("config.json", training=True)
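The switch from relative to absolute imports matters because training.py is executed directly as a script (see the start_training.sh change at the bottom of this commit): a file run as `python training.py` is the __main__ module with no parent package, so `from .modules.rag_llm import *` raises an ImportError, while the absolute form resolves through the script's own directory. A tiny illustration of the mechanics, not code from the repo:

import sys

# A file run as a script is the __main__ module with no parent package,
# so relative imports have nothing to be relative to and fail.
print(__name__)     # "__main__"
print(sys.path[0])  # the script's directory, where modules/ lives,
                    # which is why `from modules.utils import *` works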
2 changes: 1 addition & 1 deletion data/attribute_info.json
@@ -1 +1 @@
[{"name": "Unnamed: 0", "description": "Unique identifier", "type": "Numeric"}, {"name": "did", "description": "Dataset ID", "type": "Numeric"}, {"name": "name", "description": "Name of the dataset", "type": "String"}, {"name": "version", "description": "Version of this dataset. '1' for original version. Auto-incremented by server.", "type": "Numeric"}, {"name": "uploader", "description": "ID of the uploader", "type": "Numeric"}, {"name": "status", "description": "Current status of the dataset. Whether the dataset is active.", "type": "String"}, {"name": "format", "description": "Format of the dataset. Example - arff format ", "type": "String"}, {"name": "MajorityClassSize", "description": "Number of instances belonging to the most frequent class.", "type": "Numeric"}, {"name": "MaxNominalAttDistinctValues", "description": "The maximum number of distinct values among attributes of the nominal type.", "type": "Numeric"}, {"name": "MinorityClassSize", "description": "Number of instances belonging to the least frequent class.", "type": "Numeric"}, {"name": "NumberOfClasses", "description": "Number of classes in the dataset. 2.0 for binary classification, and more than 2.0 for multi-class classification.", "type": "Float"}, {"name": "NumberOfFeatures", "description": "Number of features or attributes in the dataset.", "type": "Numeric"}, {"name": "NumberOfInstances", "description": "Number of instances in the dataset", "type": "Numeric"}, {"name": "NumberOfInstancesWithMissingValues", "description": "Number of instances with missing values in the dataset", "type": "Numeric"}, {"name": "NumberOfMissingValues", "description": "Number of missing values in the dataset", "type": "Numeric"}, {"name": "NumberOfNumericFeatures", "description": "Number of numeric features in the dataset", "type": "Numeric"}, {"name": "NumberOfSymbolicFeatures", "description": "Number of symbolic features in the dataset", "type": "Numeric"}, {"name": "description", "description": "Description of the dataset", "type": "String"}, {"name": "qualities", "description": "Qualities of the dataset", "type": "String"}, {"name": "features", "description": "Features of the dataset", "type": "String"}, {"name": "Combined_information", "description": "Combine information from all the coulmns in the dataset.", "type": "String"}]
[{"name": "Unnamed: 0", "description": "Unique identifier", "type": "Numeric"}, {"name": "did", "description": "Dataset ID", "type": "Numeric"}, {"name": "name", "description": "Name of the dataset", "type": "String"}, {"name": "version", "description": "Version of this dataset. '1' for original version. Auto-incremented by server.", "type": "Numeric"}, {"name": "uploader", "description": "ID of the uploader", "type": "Numeric"}, {"name": "status", "description": "Current status of the dataset. Whether the dataset is active.", "type": "String"}, {"name": "format", "description": "Format of the dataset. Example - arff format ", "type": "String"}, {"name": "MajorityClassSize", "description": "Number of instances belonging to the most frequent class.", "type": "Numeric"}, {"name": "MaxNominalAttDistinctValues", "description": "The maximum number of distinct values among attributes of the nominal type.", "type": "Numeric"}, {"name": "MinorityClassSize", "description": "Number of instances belonging to the least frequent class.", "type": "Numeric"}, {"name": "NumberOfClasses", "description": "Number of classes in the dataset. 2.0 for binary classification, and more than 2.0 for multi-class classification.", "type": "Numeric"}, {"name": "NumberOfFeatures", "description": "Number of features or attributes in the dataset.", "type": "Numeric"}, {"name": "NumberOfInstances", "description": "Number of instances in the dataset", "type": "Numeric"}, {"name": "NumberOfInstancesWithMissingValues", "description": "Number of instances with missing values in the dataset", "type": "Numeric"}, {"name": "NumberOfMissingValues", "description": "Number of missing values in the dataset", "type": "Numeric"}, {"name": "NumberOfNumericFeatures", "description": "Number of numeric features in the dataset", "type": "Numeric"}, {"name": "NumberOfSymbolicFeatures", "description": "Number of symbolic features in the dataset", "type": "Numeric"}, {"name": "description", "description": "Description of the dataset", "type": "String"}, {"name": "qualities", "description": "Qualities of the dataset", "type": "String"}, {"name": "features", "description": "Features of the dataset", "type": "String"}, {"name": "Combined_information", "description": "Combine information from all the coulmns in the dataset.", "type": "String"}]
32 changes: 23 additions & 9 deletions docker-compose.yml
@@ -1,6 +1,6 @@
services:
ollama:
# build: ollama/
# build: ollama/
image: ollama/ollama:latest
ports:
- 11434:11434
@@ -9,27 +9,41 @@ services:
# restart: always
# tty: true
container_name: ollama
restart: unless-stopped
# volumes:
# - ./get_ollama.sh:/get_ollama.sh
# entrypoint: ["/bin/bash", "/get_ollama.sh"]
# restart: unless-stopped
volumes:
- ./ollama/get_ollama.sh:/get_ollama.sh
entrypoint: ["/bin/bash", "/get_ollama.sh"]

fastapi:
build: backend/
ports:
build:
context: .
dockerfile: backend/Dockerfile
ports:
- 8000:8000
networks:
- deploy_network
depends_on:
- ollama
container_name: fastapi

llmservice:
build: llm_service/
ports:
- 8081:8081
networks:
- deploy_network
depends_on:
- ollama
container_name: llmservice

streamlit:
build: frontend/
build:
context: .
dockerfile: frontend/Dockerfile
depends_on:
- fastapi
- ollama
ports:
ports:
- 8501:8501
networks:
- deploy_network
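Besides switching fastapi and streamlit to repository-root build contexts (so their Dockerfiles can COPY ./backend, ./data, and ./frontend), the compose file gains a dedicated llmservice container on 8081. A throwaway smoke test for the published ports after `docker compose up`, assuming localhost publishing and FastAPI's stock /docs route (an assumption, since the diff does not show the routes):

import urllib.request

for name, url in {
    "ollama": "http://localhost:11434",
    "fastapi": "http://localhost:8000/docs",
    "llmservice": "http://localhost:8081/docs",
    "streamlit": "http://localhost:8501",
}.items():
    try:
        with urllib.request.urlopen(url, timeout=5) as resp:
            print(f"{name}: HTTP {resp.status}")
    except OSError as exc:
        # a smoke test only reports; it does not retry
        print(f"{name}: unreachable ({exc})")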
9 changes: 7 additions & 2 deletions frontend/Dockerfile
@@ -1,12 +1,17 @@
FROM python:3.10.14

RUN mkdir /streamlit
RUN mkdir /backend
RUN mkdir /data

COPY requirements.txt /streamlit
COPY ./data/all_dataset_description.csv /data
COPY ./data/all_flow_description.csv /data
COPY ./frontend /streamlit
COPY ./backend/config.json /backend
WORKDIR /streamlit

RUN pip install -r requirements.txt
COPY . /streamlit
#COPY . /streamlit

EXPOSE 8501

2 changes: 1 addition & 1 deletion frontend/paths.json
@@ -4,7 +4,7 @@
"local" : "http://0.0.0.0:8000/"
},
"llm_response" : {
"docker" : "http://fastapi:8081/llmquery/",
"docker" : "http://llmservice:8081/llmquery/",
"local" : "http://0.0.0.0:8081/llmquery/"
},
"structured_query": {
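The fix points the dockerized UI at the new llmservice container instead of fastapi, which does not serve /llmquery. A hypothetical loader for this file (the resolve_endpoint helper is illustrative, not part of the repo):

import json

def resolve_endpoint(name: str, in_docker: bool, path: str = "paths.json") -> str:
    # The "docker"/"local" keys and endpoint names mirror paths.json above.
    with open(path) as f:
        paths = json.load(f)
    return paths[name]["docker" if in_docker else "local"]

# e.g. resolve_endpoint("llm_response", in_docker=True)
# -> "http://llmservice:8081/llmquery/"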
4 changes: 3 additions & 1 deletion frontend/requirements.txt
@@ -1 +1,3 @@
streamlit==1.36.0
streamlit==1.36.0
streamlit_feedback
langchain_community
9 changes: 6 additions & 3 deletions frontend/ui.py
@@ -33,9 +33,11 @@
st.session_state["query"] = query
st.session_state["query_type"] = query_type

llm_filter = st.checkbox('LLM Filter')
# Submit button logic
if st.button("Submit"):
response_parser = ResponseParser(query_type, apply_llm_before_rag=False)
apply_llm_before_rag=None if not llm_filter else False
response_parser = ResponseParser(query_type, apply_llm_before_rag=apply_llm_before_rag)
if query_type == "Dataset":
with st.spinner("Waiting for results..."):
if config["structured_query"] == True:
@@ -59,8 +61,9 @@
else:
# get rag response
response_parser.fetch_rag_response(query_type, query)
# get llm response
response_parser.fetch_llm_response(query)
if llm_filter:
# get llm response
response_parser.fetch_llm_response(query)
# get updated columns based on llm response

results = response_parser.parse_and_update_response(data_metadata)
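The new checkbox drives a three-state flag that ui_utils.py (next file) interprets as: None means skip LLM filtering entirely, False means RAG first with the LLM filter second, and True means LLM before RAG. The checkbox can only ever produce None or False; spelled out as a sketch:

llm_filter = True  # stand-in for st.checkbox('LLM Filter')

# None  -> no LLM filter at all
# False -> RAG response first, LLM filter applied to the result
# True  -> LLM filter before RAG (not reachable from this checkbox)
apply_llm_before_rag = None if not llm_filter else False
print(apply_llm_before_rag)  # False here; None when the box is unticked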
30 changes: 23 additions & 7 deletions frontend/ui_utils.py
@@ -205,8 +205,28 @@ def parse_and_update_response(self, metadata: pd.DataFrame):
- self.apply_llm_before_rag == False
- Metadata is filtered based by the Query parsing LLM first and the rag response second
"""
if self.rag_response is not None and self.llm_response is not None:
if not self.apply_llm_before_rag:
if self.apply_llm_before_rag is None or self.llm_response is None:
print('No LLM filter.')
print(self.rag_response)
filtered_metadata = metadata[
metadata["did"].isin(self.rag_response["initial_response"])
]
print(filtered_metadata)
# if no llm response is required, return the initial response
return filtered_metadata

elif self.rag_response is not None and self.llm_response is not None:
if self.apply_llm_before_rag is None:
print('No LLM filter.')
print(self.rag_response)
filtered_metadata = metadata[
metadata["did"].isin(self.rag_response["initial_response"])
]
print(filtered_metadata)
# if no llm response is required, return the initial response
return filtered_metadata
elif not self.apply_llm_before_rag:
print('RAG before LLM filter.')
filtered_metadata = metadata[
metadata["did"].isin(self.rag_response["initial_response"])
]
@@ -216,6 +236,7 @@ def parse_and_update_response(self, metadata: pd.DataFrame):
llm_parser.get_attributes_from_response()
return llm_parser.update_subset_cols(filtered_metadata)
elif self.apply_llm_before_rag:
print('LLM filter before RAG')
llm_parser = LLMResponseParser(self.llm_response)
llm_parser.get_attributes_from_response()
filtered_metadata = llm_parser.update_subset_cols(metadata)
@@ -224,9 +245,6 @@
filtered_metadata["did"].isin(self.rag_response["initial_response"])
]

elif self.apply_llm_before_rag is None:
# if no llm response is required, return the initial response
return metadata
elif (
self.rag_response is not None and self.structured_query_response is not None
):
@@ -242,7 +260,5 @@
]
print("Showing only rag response")
return filtered_metadata[["did", "name", *col_name]]
else:
return metadata


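Every branch of parse_and_update_response reduces to the same core operation: subset the metadata frame to the dataset ids (did) returned by the RAG step. In isolation, with toy data:

import pandas as pd

# Keep only the metadata rows whose `did` appears in the RAG response.
metadata = pd.DataFrame({"did": [1, 2, 3], "name": ["iris", "adult", "mnist"]})
rag_response = {"initial_response": [3, 1]}

filtered = metadata[metadata["did"].isin(rag_response["initial_response"])]
print(filtered)  # rows for did 1 and 3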
9 changes: 4 additions & 5 deletions llm_service/Dockerfile
@@ -1,12 +1,11 @@
FROM python:3.10.14
RUN mkdir /llmservice
COPY requirements.txt fastapi/requirements.txt
WORKDIR /fastapi
COPY . /llmservice
WORKDIR /llmservice

RUN pip install -r requirements.txt

COPY . /llmservice

EXPOSE 8001
EXPOSE 8081

ENTRYPOINT ["uvicorn", "llmservice:app", "--host", "0.0.0.0", "--port", "8001"]
ENTRYPOINT ["uvicorn", "llm_service:app", "--host", "0.0.0.0", "--port", "8081"]
8 changes: 6 additions & 2 deletions llm_service/llm_service.py
@@ -25,8 +25,9 @@
# join the prompt dictionary to the prompt template to create the final prompt
prompt = prompt_template + "\n".join([prompt_dict[key] for key in prompt_dict.keys()])

chain = create_chain(prompt)

chain = create_chain(prompt)
chain_docker = create_chain(prompt, base_url='http://ollama:11434')
app = FastAPI()


@@ -38,6 +39,9 @@ async def get_llm_query(query: str):
Description: Get the query, replace %20 (url spacing) with space and invoke the chain to get the answers based on the prompt
"""
query = query.replace("%20", " ")
response = chain.invoke({"query": query})
try:
response = chain_docker.invoke({"query": query})
except:
response = chain.invoke({"query": query})
answers = parse_answers_initial(response, patterns, prompt_dict)
return JSONResponse(content=answers)
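The handler now tries the compose-internal Ollama endpoint first and silently falls back to the localhost chain. The bare except also swallows unrelated failures (bad prompts, parsing errors); a narrower sketch, not the commit's code, would scope the fallback to connection-type problems:

def invoke_with_fallback(query: str, primary, fallback):
    """Try the dockerized chain first; fall back only when the primary
    backend is unreachable, not on arbitrary exceptions."""
    try:
        return primary.invoke({"query": query})
    except (ConnectionError, OSError):
        return fallback.invoke({"query": query})

# e.g. invoke_with_fallback(query, chain_docker, chain)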
4 changes: 2 additions & 2 deletions llm_service/llm_service_utils.py
@@ -4,12 +4,12 @@
from langchain_core.prompts import ChatPromptTemplate


def create_chain(prompt, model: str = "llama3", temperature: int = 0):
def create_chain(prompt, model: str = "llama3", temperature: int = 0, base_url: str = "http://localhost:11434"):
"""
Description: Create a langchain chain with the given prompt and model and the temperature.
The lower the temperature, the less "creative" the model will be.
"""
llm = ChatOllama(model=model, temperature=temperature)
llm = ChatOllama(model=model, temperature=temperature, base_url=base_url)
prompt = ChatPromptTemplate.from_template(prompt)

return prompt | llm | StrOutputParser()
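With base_url exposed, one factory serves both deployments: inside docker-compose the Ollama server is reachable by service name, while the default keeps pointing at localhost. A usage sketch (the prompt text is illustrative, and a running Ollama server is required for invoke to succeed):

chain_local = create_chain("Answer briefly: {query}")
chain_docker = create_chain("Answer briefly: {query}",
                            base_url="http://ollama:11434")

# Both chains are prompt | llm | StrOutputParser, so invoke returns a str.
print(chain_docker.invoke({"query": "What is OpenML?"}))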
1 change: 1 addition & 0 deletions llm_service/requirements.txt
@@ -6,3 +6,4 @@ langchain==0.2.6
langchain_community==0.2.6
langchain_core==0.2.11
tenacity==8.3.0
regex
4 changes: 2 additions & 2 deletions ollama/Dockerfile
@@ -7,12 +7,12 @@ RUN mkdir /ollama

# COPY requirements.txt /ollama
COPY get_ollama.sh /ollama
COPY base_logger.py /ollama
#COPY base_logger.py /ollama
WORKDIR /ollama

RUN curl -fsSL https://ollama.com/install.sh | sh
RUN chmod +x get_ollama.sh
RUN python3 base_logger.py
#RUN python3 base_logger.py
EXPOSE 11434

ENTRYPOINT ["ollama", "serve"]
2 changes: 2 additions & 0 deletions ollama/get_ollama.sh
@@ -6,4 +6,6 @@ while [ "$(ollama list | grep 'NAME')" == "" ]; do
sleep 1
done


ollama run llama3
tail -f /dev/null
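The added tail -f /dev/null keeps the container's entrypoint process alive after `ollama run llama3` returns, and the loop above waits until `ollama list` shows a model. A rough Python equivalent of that wait, assuming Ollama's /api/tags model-listing endpoint:

import json
import time
import urllib.request

# Poll the Ollama HTTP API until at least one model is listed.
while True:
    try:
        with urllib.request.urlopen("http://localhost:11434/api/tags") as r:
            if json.load(r).get("models"):
                break
    except OSError:
        pass  # server not up yet
    time.sleep(1)
print("a model is available")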
2 changes: 1 addition & 1 deletion start_training.sh
@@ -1,3 +1,3 @@
cd ai_search/backend
cd backend
python training.py
cd ..
