-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Integration of structured query and database filtering
- Loading branch information
1 parent
f2b032d
commit 44c30c9
Showing
12 changed files
with
1,924 additions
and
244 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
[{"name": "Unnamed: 0", "description": "Unique identifier", "type": "Numeric"}, {"name": "did", "description": "Dataset ID", "type": "Numeric"}, {"name": "name", "description": "Name of the dataset", "type": "String"}, {"name": "version", "description": "Version of this dataset. '1' for original version. Auto-incremented by server.", "type": "Numeric"}, {"name": "uploader", "description": "ID of the uploader", "type": "Numeric"}, {"name": "status", "description": "Current status of the dataset. Whether the dataset is active.", "type": "String"}, {"name": "format", "description": "Format of the dataset. Example - arff format ", "type": "String"}, {"name": "MajorityClassSize", "description": "Number of instances belonging to the most frequent class.", "type": "Numeric"}, {"name": "MaxNominalAttDistinctValues", "description": "The maximum number of distinct values among attributes of the nominal type.", "type": "Numeric"}, {"name": "MinorityClassSize", "description": "Number of instances belonging to the least frequent class.", "type": "Numeric"}, {"name": "NumberOfClasses", "description": "Number of classes in the dataset. 
2.0 for binary classification, and more than 2.0 for multi-class classification.", "type": "Numeric"}, {"name": "NumberOfFeatures", "description": "Number of features or attributes in the dataset.", "type": "Numeric"}, {"name": "NumberOfInstances", "description": "Number of instances in the dataset", "type": "Numeric"}, {"name": "NumberOfInstancesWithMissingValues", "description": "Number of instances with missing values in the dataset", "type": "Numeric"}, {"name": "NumberOfMissingValues", "description": "Number of missing values in the dataset", "type": "Numeric"}, {"name": "NumberOfNumericFeatures", "description": "Number of numeric features in the dataset", "type": "Numeric"}, {"name": "NumberOfSymbolicFeatures", "description": "Number of symbolic features in the dataset", "type": "Numeric"}, {"name": "description", "description": "Description of the dataset", "type": "String"}, {"name": "qualities", "description": "Qualities of the dataset", "type": "String"}, {"name": "features", "description": "Features of the dataset", "type": "String"}, {"name": "Combined_information", "description": "Combine information from all the coulmns in the dataset.", "type": "String"}] | ||
[{"name": "Unnamed: 0", "description": "Unique identifier", "type": "Numeric"}, {"name": "did", "description": "Dataset ID", "type": "Numeric"}, {"name": "name", "description": "Name of the dataset", "type": "String"}, {"name": "version", "description": "Version of this dataset. '1' for original version. Auto-incremented by server.", "type": "Numeric"}, {"name": "uploader", "description": "ID of the uploader", "type": "Numeric"}, {"name": "status", "description": "Current status of the dataset. Whether the dataset is active.", "type": "String"}, {"name": "format", "description": "Format of the dataset. Example - arff format ", "type": "String"}, {"name": "MajorityClassSize", "description": "Number of instances belonging to the most frequent class.", "type": "Numeric"}, {"name": "MaxNominalAttDistinctValues", "description": "The maximum number of distinct values among attributes of the nominal type.", "type": "Numeric"}, {"name": "MinorityClassSize", "description": "Number of instances belonging to the least frequent class.", "type": "Numeric"}, {"name": "NumberOfClasses", "description": "Number of classes in the dataset. 
2.0 for binary classification, and more than 2.0 for multi-class classification.", "type": "Float"}, {"name": "NumberOfFeatures", "description": "Number of features or attributes in the dataset.", "type": "Numeric"}, {"name": "NumberOfInstances", "description": "Number of instances in the dataset", "type": "Numeric"}, {"name": "NumberOfInstancesWithMissingValues", "description": "Number of instances with missing values in the dataset", "type": "Numeric"}, {"name": "NumberOfMissingValues", "description": "Number of missing values in the dataset", "type": "Numeric"}, {"name": "NumberOfNumericFeatures", "description": "Number of numeric features in the dataset", "type": "Numeric"}, {"name": "NumberOfSymbolicFeatures", "description": "Number of symbolic features in the dataset", "type": "Numeric"}, {"name": "description", "description": "Description of the dataset", "type": "String"}, {"name": "qualities", "description": "Qualities of the dataset", "type": "String"}, {"name": "features", "description": "Features of the dataset", "type": "String"}, {"name": "Combined_information", "description": "Combined information from all the columns in the dataset.", "type": "String"}] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
#!/bin/bash
# Install dependencies, stop stale services, then launch Ollama and the
# structured-query API in the background and record their PIDs.

poetry install

# Stop instances left over from a previous run. killall reports an error when
# nothing matches; that is harmless here and the script carries on.
killall ollama
killall streamlit

# Define a file to store the PIDs.
# The path is made absolute because the script changes directory below; a
# relative path would create a separate PID file in every directory visited.
PID_FILE="$(pwd)/processes.pid"

# Start from an empty file so PIDs from an earlier run are not kept around.
: > "$PID_FILE"

# Start processes and save their PIDs (append, one PID per line).
cd ollama || exit 1
./get_ollama.sh &
echo $! >> "$PID_FILE"

cd ../structured_query || exit 1
uvicorn llm_service_structured_query:app --host 0.0.0.0 --port 8082 &
echo $! >> "$PID_FILE"

cd ..
# Keep the script running to maintain the background processes
wait
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
import sqlalchemy | ||
import pandas as pd | ||
import chromadb | ||
from langchain_community.vectorstores.chroma import Chroma | ||
from tqdm.auto import tqdm | ||
|
||
import sys | ||
|
||
sys.path.append("../") | ||
sys.path.append("../backend/") | ||
from backend.modules.utils import load_config_and_device | ||
|
||
# Module-level setup: everything below runs once, when this module is imported. | ||
# Configuration is read from the backend's config.json (path is relative to the working directory). | ||
config = load_config_and_device("../backend/config.json") | ||
|
||
# load the persistent database using ChromaDB | ||
# The on-disk location comes from the "chroma_metadata_dir" config entry. | ||
client = chromadb.PersistentClient(path=config["chroma_metadata_dir"]) | ||
|
||
# Fetch the collection named "metadata", creating it if it does not exist yet. | ||
collec = client.get_or_create_collection(name = "metadata") | ||
|
||
# Read the dataset descriptions from CSV and drop the "Combined_information" column. | ||
metadata_df = pd.read_csv("../data/all_dataset_description.csv") | ||
metadata_df = metadata_df.drop(columns=["Combined_information"]) | ||
|
||
# Function to chunk the DataFrame | ||
def chunk_dataframe(df, chunk_size):
    """Yield consecutive row slices of *df*, each at most *chunk_size* rows long.

    The slices are taken positionally and in order; the final slice holds
    whatever rows remain and may therefore be shorter than *chunk_size*.
    """
    row_count = df.shape[0]
    for start in range(0, row_count, chunk_size):
        stop = start + chunk_size
        yield df.iloc[start:stop]
|
||
def load_chroma_metadata():
    """Load the rows of ``metadata_df`` into the ChromaDB collection ``collec``.

    The DataFrame is processed in fixed-size chunks. For every row the
    ``did`` column (as a string) is used as the record id, the
    ``description`` column (as a string) as the document, and the whole row
    as the metadata dict. Rows whose id is already stored in the collection
    are skipped, so calling this function repeatedly does not re-add them.

    Returns:
        The module-level ``collec`` collection object.
    """
    # Define the chunk size
    chunk_size = 100  # Adjust the chunk size as needed

    # Ceiling division: the exact number of chunks, including when the row
    # count is an exact multiple of chunk_size.
    total_chunks = -(-len(metadata_df) // chunk_size)

    # Process each chunk
    for chunk in tqdm(chunk_dataframe(metadata_df, chunk_size), total=total_chunks):
        ids = chunk["did"].astype(str).tolist()
        documents = chunk["description"].astype(str).tolist()
        metadatas = chunk.to_dict(orient="records")

        # get() returns a dict-like result, never a bare list, so comparing it
        # to [] can never be true. Its "ids" entry lists which of the
        # requested ids are already stored.
        already_stored = set(collec.get(ids=ids)["ids"])
        pending = [
            (record_id, document, metadata)
            for record_id, document, metadata in zip(ids, documents, metadatas)
            if record_id not in already_stored
        ]
        if not pending:
            continue

        # Add to ChromaDB collection (only the rows that are not there yet)
        new_ids, new_documents, new_metadatas = (list(column) for column in zip(*pending))
        collec.add(ids=new_ids, documents=new_documents, metadatas=new_metadatas)

    return collec
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.