From 1350cd6ae0bb6b231902e1cd6651dde57b736e24 Mon Sep 17 00:00:00 2001
From: SubhadityaMukherjee
Date: Wed, 26 Jun 2024 12:11:06 +0200
Subject: [PATCH] Deployed e44979b with MkDocs version: 1.6.0

---
 objects.inv              | Bin 638 -> 689 bytes
 search/search_index.json |   2 +-
 testing/index.html       | 359 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 360 insertions(+), 1 deletion(-)

diff --git a/objects.inv b/objects.inv
index 9714f6885ee94016c40315b9421ad6dd2dd783e6..4e4b49e6667895bba9db952e8971782f7a3f6eb9 100644
GIT binary patch
delta 559 (base85-encoded binary payload, not reproduced here)
delta 507 (base85-encoded binary payload, not reproduced here)

diff --git a/search/search_index.json b/search/search_index.json
index 3072f7a..a087140 100644
--- a/search/search_index.json
+++ b/search/search_index.json
@@ -1 +1 @@
-{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"RAG pipeline for OpenML","text":"
  • This repository contains the code for the RAG pipeline for OpenML.
"},{"location":"#getting-started","title":"Getting started","text":"
  • A docker image will be provided at a later date for easier setup
  • Clone the repository
  • Create a virtual environment and activate it
  • Install the requirements using pip install -r requirements.txt
  • Run training.py (for the first time, or to update the model). This handles metadata download, embedding creation, and building the vector store. (Refer to the training section for more details)
  • Install Ollama (https://ollama.com/) and download the models ollama run qwen2:1.5b and ollama run phi3
  • Run uvicorn backend:app to start the FastAPI server.
  • Run streamlit run main.py to start the Streamlit frontend (this uses the FastAPI server so make sure it is running)
  • Enjoy :)
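  • (Optional) Once the backend is up, you can sanity-check it from Python. A minimal sketch, assuming a hypothetical /dataset/{query} route (check backend.py for the actual endpoint paths):

import requests\n\n# NOTE: hypothetical route - check backend.py for the real endpoint paths\nresponse = requests.get(\"http://127.0.0.1:8000/dataset/find%20me%20a%20flower%20dataset\")\nprint(response.json())\n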
"},{"location":"configuration/","title":"Configuration","text":"
  • The main config file is config.json
  • Possible options are as follows:
  • rqa_prompt_template: The template for the RAG pipeline search prompt. This is used by the model to query the database.
  • llm_prompt_template: The template for the summary generator LLM prompt.
  • num_return_documents: Number of documents to return for a query. Too high a number can lead to Out of Memory errors. (Defaults to 50)
  • embedding_model: The model used to generate embeddings for the documents; queries are embedded with the same model so they can be compared against the stored documents. (Defaults to BAAI/bge-large-en-v1.5)
    • Other models that have been tested:
      • BAAI/bge-base-en-v1.5
      • BAAI/bge-large-en-v1.5
      • WhereIsAI/UAE-Large-V1
  • llm_model: The model used for generating the result summary. (Defaults to qwen2:1.5b)
  • data_dir: The directory to store the intermediate data like tables/databases etc. (Defaults to ./data/)
  • persist_dir: The directory for the cached Chroma database; the document embeddings are stored here under a unique hash. (Defaults to ./data/chroma_db/)
  • testing_flag: Enables testing mode by using subsets of the data for quick debugging. This is used to test the pipeline and is not recommended for normal use. (Defaults to False)
  • data_download_n_jobs: Number of jobs to run in parallel for downloading data. (Defaults to 20)
  • training: Whether to train the model or not. (Defaults to False) This is automatically set to True when running the training.py script. Do NOT set this to True manually.
  • search_type : The type of vector comparison to use. (Defaults to \"similarity\")
  • reranking: Whether to rerank the results using the FlashRank algorithm. (Defaults to False)
  • long_context_reorder: Whether to reorder the results using the Long Context Reordering algorithm. (Defaults to False)
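For reference, a config.json along these lines would exercise the options above (illustrative values only, with the prompt templates elided; the full key list can be seen in tests/unit_testing.py):

{\n    \"rqa_prompt_template\": \"...\",\n    \"llm_prompt_template\": \"...\",\n    \"num_return_documents\": 50,\n    \"embedding_model\": \"BAAI/bge-large-en-v1.5\",\n    \"llm_model\": \"qwen2:1.5b\",\n    \"data_dir\": \"./data/\",\n    \"persist_dir\": \"./data/chroma_db/\",\n    \"testing_flag\": false,\n    \"data_download_n_jobs\": 20,\n    \"training\": false,\n    \"search_type\": \"similarity\",\n    \"reranking\": false,\n    \"long_context_reorder\": false\n}\n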
"},{"location":"docker/","title":"Docker container","text":"
  • Still a WIP
  • Run docker compose build --progress=plain
"},{"location":"docker/#potential-errors","title":"Potential Errors","text":"
  • If you get a memory error, you can run docker system prune. Be careful with this command: it removes all stopped containers, all dangling images, and all unused networks, so ensure none of your containers hold important data before running it.
  • On Docker Desktop for Mac, increase the memory limit to as much as your system can handle.
"},{"location":"inference/","title":"Inference","text":"
  • Run the inference using uvicorn main:app and streamlit run main.py in different processes.
"},{"location":"testing/","title":"Testing","text":""},{"location":"testing/#load-testing","title":"Load Testing","text":"
  • Load testing can be done using Locust, a tool that simulates many users querying the API and measures its performance under that load.
  • It is possible to configure the number of users, the spawn (hatch) rate, and the duration of the test.
"},{"location":"testing/#running-the-load-test","title":"Running the load test","text":"
  • Start the FastAPI server using uvicorn main:app
  • Load testing using Locust (locust -f tests/locust_test.py --host http://127.0.0.1:8000 ) using a different terminal
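  • If tests/locust_test.py is unavailable, a minimal locustfile along these lines can serve as a starting point (a sketch: the endpoint path is an assumption and should match the routes in backend.py):

from locust import HttpUser, between, task\n\nclass RagUser(HttpUser):\n    # each simulated user waits 1-5 seconds between requests\n    wait_time = between(1, 5)\n\n    @task\n    def query_datasets(self):\n        # hypothetical route - adjust to the actual FastAPI endpoints\n        self.client.get(\"/dataset/find%20me%20a%20flower%20dataset\")\n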
"},{"location":"training/","title":"Training","text":"
  • Although no new model is trained, the existing embedding model is run over the metadata to create embeddings. The name "training" may be misleading, but it was chosen to keep the naming consistent with other codebases.
  • (We may fine-tune the model in the future.)
  • The training script is present in training.py. Running this script will take care of everything.
"},{"location":"training/#what-does-the-training-script-do","title":"What does the training script do?","text":"
  • Load the config file and set the necessary variables
  • If testing_flag is set to True, the script will use a subset of the data for quick debugging
  • testing_flag is set to True
  • persist_dir is set to ./data/chroma_db_testing
  • test_subset_2000 is set to True
  • data_dir is set to ./data/testing_data/
  • If testing_flag is set to False, the script will use the entire dataset
  • For all datasets in the OpenML dataset list:
  • Download the dataset
  • Create the vector dataset with computed embeddings
  • Create a vectordb retriever
  • Run some test queries
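In rough outline, the script boils down to the following sketch (the module import paths assume it is run from the serving/ directory; training.py itself remains the authoritative entry point):

import chromadb\nfrom modules.general_utils import load_config_and_device\nfrom modules.llm import setup_vector_db_and_qa\n\n# load config.json and pick cuda / mps / cpu\nconfig = load_config_and_device(\"config.json\", training=True)\n# training.py flips this flag itself; do not set it in config.json\nconfig[\"training\"] = True\nclient = chromadb.PersistentClient(path=config[\"persist_dir\"])\nfor data_type in [\"dataset\", \"flow\"]:\n    # download metadata, compute embeddings, and return a retriever to run test queries against\n    qa = setup_vector_db_and_qa(config=config, data_type=data_type, client=client)\n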
"},{"location":"modules/general_utils/","title":"General utils","text":""},{"location":"modules/general_utils/#general_utils.find_device","title":"find_device(training=False)","text":"

Description: Find the device to use for the pipeline. If cuda is available, use it. If not, check if MPS is available and use it. If not, use CPU.

Input: training (bool) : Whether the pipeline is being used for training or not.

Returns: device (str) : The device to use for the pipeline.

Source code in serving/modules/general_utils.py
def find_device(training: bool = False ) -> str:\n    \"\"\"\n    Description: Find the device to use for the pipeline. If cuda is available, use it. If not, check if MPS is available and use it. If not, use CPU.\n\n    Input: training (bool) : Whether the pipeline is being used for training or not.\n\n    Returns: device (str) : The device to use for the pipeline.\n    \"\"\"\n    print(\"[INFO] Finding device.\")\n    if torch.cuda.is_available():\n        return \"cuda\"\n    elif torch.backends.mps.is_available():\n        if training == False:\n            # loading metadata on mps for inference is quite slow. So disabling for now.\n            return \"cpu\"\n        return \"mps\"\n    else:\n        return \"cpu\"\n
"},{"location":"modules/general_utils/#general_utils.load_config_and_device","title":"load_config_and_device(config_file, training=False)","text":"

Description: Load the config file and find the device to use for the pipeline.

Input: config_file (str) : The path to the config file. training (bool) : Whether the pipeline is being used for training or not.

Returns: config (dict) : The config dictionary, with the selected device stored under the "device" key.

Source code in serving/modules/general_utils.py
def load_config_and_device(config_file: str, training: bool = False) -> dict:\n    \"\"\"\n    Description: Load the config file and find the device to use for the pipeline.\n\n    Input: config_file (str) : The path to the config file.\n    training (bool) : Whether the pipeline is being used for training or not.\n\n    Returns: config (dict) : The config dictionary + device (str) : The device to use for the pipeline.\n    \"\"\"\n    # Check if the config file exists and load it\n    if not os.path.exists(config_file):\n        raise Exception(\"Config file does not exist.\")\n    with open(config_file, \"r\") as f:\n        config = json.load(f)\n\n    # Find device and set it in the config between cpu and cuda and mps if available\n    config[\"device\"] = find_device(training)\n    print(f\"[INFO] Device found: {config['device']}\")\n    return config\n
"},{"location":"modules/llm_module/","title":"Llm module","text":""},{"location":"modules/llm_module/#llm.add_documents_to_db","title":"add_documents_to_db(db, unique_docs, unique_ids)","text":"

Description: Add documents to the vector store in batches of 512.

Input: db (Chroma), unique_docs (list), unique_ids (list)

Returns: None

Source code in serving/modules/llm.py
def add_documents_to_db(db, unique_docs, unique_ids):\n    \"\"\"\n    Description: Add documents to the vector store in batches of 512.\n\n    Input: db (Chroma), unique_docs (list), unique_ids (list)\n\n    Returns: None\n    \"\"\"\n    bs = 512\n    if len(unique_docs) < bs:\n        db.add_documents(unique_docs, ids=unique_ids)\n    else:\n        for i in tqdm(range(0, len(unique_docs), bs)):\n            db.add_documents(unique_docs[i : i + bs], ids=unique_ids[i : i + bs])\n
"},{"location":"modules/llm_module/#llm.create_vector_store","title":"create_vector_store(metadata_df, chroma_client, config, embeddings, collection_name)","text":"

Description: Create the vector store using Chroma db. The documents are loaded and processed, unique documents are generated, and the documents are added to the vector store.

Input: metadata_df (pd.DataFrame), chroma_client (chromadb.PersistentClient), config (dict), embeddings (HuggingFaceEmbeddings), collection_name (str)

Returns: db (Chroma)

Source code in serving/modules/llm.py
def create_vector_store(\n    metadata_df: pd.DataFrame, chroma_client:ClientAPI, config: dict, embeddings: HuggingFaceEmbeddings, collection_name: str \n) -> Chroma:\n    \"\"\"\n    Description: Create the vector store using Chroma db. The documents are loaded and processed, unique documents are generated, and the documents are added to the vector store.\n\n    Input: metadata_df (pd.DataFrame), chroma_client (chromadb.PersistentClient), config (dict), embeddings (HuggingFaceEmbeddings), collection_name (str)\n\n    Returns: db (Chroma)\n    \"\"\"\n\n    db = Chroma(\n        client=chroma_client,\n        embedding_function=embeddings,\n        persist_directory=config[\"persist_dir\"],\n        collection_name=collection_name,\n    )\n\n    documents = load_and_process_data(\n        metadata_df, page_content_column=\"Combined_information\"\n    )\n    if config[\"testing_flag\"]:\n        # subset the data for testing\n        if config[\"test_subset_2000\"] == True:\n            print(\"[INFO] Subsetting the data to 2000 rows.\")\n            documents = documents[:2000]\n    unique_docs, unique_ids = generate_unique_documents(documents, db)\n\n    print(\n        f\"Number of unique documents: {len(unique_docs)} vs Total documents: {len(documents)}\"\n    )\n    if len(unique_docs) == 0:\n        print(\"No new documents to add.\")\n        return db\n    else:\n        # db.add_documents(unique_docs, ids=unique_ids)\n        add_documents_to_db(db, unique_docs, unique_ids)\n\n    return db\n
"},{"location":"modules/llm_module/#llm.generate_unique_documents","title":"generate_unique_documents(documents, db)","text":"Generate unique documents by removing duplicates. This is done by generating unique IDs for the documents and keeping only one of the duplicate IDs.

Source: https://stackoverflow.com/questions/76265631/chromadb-add-single-document-only-if-it-doesnt-exist

Input: documents (list)

Returns: unique_docs (list), unique_ids (list)

Source code in serving/modules/llm.py
def generate_unique_documents(documents: list, db: Chroma) -> tuple:\n    \"\"\"\n    Description: Generate unique documents by removing duplicates. This is done by generating unique IDs for the documents and keeping only one of the duplicate IDs.\n        Source: https://stackoverflow.com/questions/76265631/chromadb-add-single-document-only-if-it-doesnt-exist\n\n    Input: documents (list)\n\n    Returns: unique_docs (list), unique_ids (list)\n    \"\"\"\n\n    # Remove duplicates based on ID (from database)\n    new_document_ids = set([str(x.metadata[\"did\"]) for x in documents])\n    print(f\"[INFO] Generating unique documents. Total documents: {len(documents)}\")\n    try:\n        old_dids = set([str(x[\"did\"]) for x in db.get()[\"metadatas\"]])\n    except KeyError:\n        old_dids = set([str(x[\"id\"]) for x in db.get()[\"metadatas\"]])\n\n    new_dids = new_document_ids - old_dids\n    documents = [x for x in documents if str(x.metadata[\"did\"]) in new_dids]\n    ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS,doc.page_content)) for doc in documents]\n\n    # Remove duplicates based on document content (from new documents)\n    unique_ids = list(set(ids))\n    seen_ids = set()\n    unique_docs = [\n            doc\n            for doc, id in zip(documents, ids)\n            if id not in seen_ids and (seen_ids.add(id) or True)\n        ]\n\n    return unique_docs, unique_ids\n
"},{"location":"modules/llm_module/#llm.get_collection_name","title":"get_collection_name(config)","text":"

Description: Get the collection name based on the type of data provided in the config.

Input: config (dict)

Returns: str

Source code in serving/modules/llm.py
def get_collection_name(config: dict) -> str:\n    \"\"\"\n    Description: Get the collection name based on the type of data provided in the config.\n\n    Input: config (dict)\n\n    Returns: str\n    \"\"\"\n    return {\"dataset\": \"datasets\", \"flow\": \"flows\"}.get(\n        config[\"type_of_data\"], \"default\"\n    )\n
"},{"location":"modules/llm_module/#llm.get_llm_chain","title":"get_llm_chain(config)","text":"

Description: Get the LLM chain with the specified model and prompt template.

Input: config (dict)

Returns: LLMChain

Source code in serving/modules/llm.py
def get_llm_chain(config: dict) -> LLMChain:\n    \"\"\"\n    Description: Get the LLM chain with the specified model and prompt template.\n\n    Input: config (dict)\n\n    Returns: LLMChain\n    \"\"\"\n\n    llm = Ollama(\n        model = config[\"llm_model\"] \n    )  \n    map_template = config[\"llm_prompt_template\"]\n    map_prompt = PromptTemplate.from_template(map_template)\n    return LLMChain(llm=llm, prompt=map_prompt)\n
"},{"location":"modules/llm_module/#llm.initialize_llm_chain","title":"initialize_llm_chain(vectordb, config)","text":"

Description: Initialize the LLM chain and setup Retrieval QA with the specified configuration.

Input: vectordb (Chroma), config (dict)

Returns: qa (langchain.chains.retrieval_qa.base.RetrievalQA)

Source code in serving/modules/llm.py
def initialize_llm_chain(\n    vectordb: Chroma,\n    config : dict\n) -> langchain.chains.retrieval_qa.base.RetrievalQA:\n    \"\"\"\n    Description: Initialize the LLM chain and setup Retrieval QA with the specified configuration.\n\n    Input: vectordb (Chroma), config (dict)\n\n    Returns: qa (langchain.chains.retrieval_qa.base.RetrievalQA)\n    \"\"\"\n\n    return vectordb.as_retriever(\n        search_type=config[\"search_type\"],\n        search_kwargs={\"k\": config[\"num_return_documents\"]},\n    )\n
"},{"location":"modules/llm_module/#llm.load_and_process_data","title":"load_and_process_data(metadata_df, page_content_column)","text":"

Description: Load and process the data for the vector store. Split the documents into chunks of 1000 characters.

Input: metadata_df (pd.DataFrame), page_content_column (str)

Returns: chunked documents (list)

Source code in serving/modules/llm.py
def load_and_process_data(metadata_df: pd.DataFrame, page_content_column: str) -> list:\n    \"\"\"\n    Description: Load and process the data for the vector store. Split the documents into chunks of 1000 characters.\n\n    Input: metadata_df (pd.DataFrame), page_content_column (str)\n\n    Returns: chunked documents (list)\n    \"\"\"\n    # Load data\n    loader = DataFrameLoader(metadata_df, page_content_column=page_content_column)\n    documents = loader.load()\n\n    # Split documents\n    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)\n    documents = text_splitter.split_documents(documents)\n\n    return documents\n
"},{"location":"modules/llm_module/#llm.load_document_and_create_vector_store","title":"load_document_and_create_vector_store(metadata_df, chroma_client, config)","text":"

Loads the documents and creates the vector store. If the training flag is set to True, the documents are added to the vector store. If the training flag is set to False, the vector store is loaded from the persist directory.

Parameters:

  • metadata_df (DataFrame), required : The metadata dataframe.
  • chroma_client (PersistentClient), required : The Chroma client.
  • config (dict), required : The configuration dictionary.

Returns:

  • Chroma : The Chroma vector store.

Source code in serving/modules/llm.py
def load_document_and_create_vector_store(metadata_df: pd.DataFrame, chroma_client:ClientAPI , config: dict) -> Chroma:\n    \"\"\"\n    Loads the documents and creates the vector store. If the training flag is set to True,\n    the documents are added to the vector store. If the training flag is set to False,\n    the vector store is loaded from the persist directory.\n\n    Args:\n        metadata_df (pd.DataFrame): The metadata dataframe.\n        chroma_client (chromadb.PersistentClient): The Chroma client.\n        config (dict): The configuration dictionary.\n\n    Returns:\n        Chroma: The Chroma vector store.\n    \"\"\"\n    embeddings = load_model(config)\n    collection_name = get_collection_name(config)\n\n    if not config[\"training\"]:\n        return load_vector_store(chroma_client, config, embeddings, collection_name)\n\n    return create_vector_store(\n        metadata_df, chroma_client, config, embeddings, collection_name\n    )\n
"},{"location":"modules/llm_module/#llm.load_model","title":"load_model(config)","text":"

Description: Load the model using HuggingFaceEmbeddings.

Input: config (dict)

Returns: HuggingFaceEmbeddings

Source code in serving/modules/llm.py
def load_model(config: dict) -> HuggingFaceEmbeddings | None:\n    \"\"\"\n    Description: Load the model using HuggingFaceEmbeddings.\n\n    Input: config (dict)\n\n    Returns: HuggingFaceEmbeddings\n    \"\"\"\n    print(\"[INFO] Loading model...\")\n    model_kwargs = {\"device\": config[\"device\"], \"trust_remote_code\": True}\n    encode_kwargs = {\"normalize_embeddings\": True}\n    embeddings = HuggingFaceEmbeddings(\n        model_name=config[\"embedding_model\"],\n        model_kwargs=model_kwargs,\n        encode_kwargs=encode_kwargs,\n        show_progress = True,\n        # trust_remote_code=True\n    )\n    print(\"[INFO] Model loaded.\")\n    return embeddings\n
"},{"location":"modules/llm_module/#llm.load_vector_store","title":"load_vector_store(chroma_client, config, embeddings, collection_name)","text":"

Description: Load the vector store from the persist directory.

Input: chroma_client (chromadb.PersistentClient), config (dict), embeddings (HuggingFaceEmbeddings), collection_name (str)

Returns: Chroma

Source code in serving/modules/llm.py
def load_vector_store(chroma_client: ClientAPI, config: dict, embeddings: HuggingFaceEmbeddings, collection_name: str) -> Chroma:\n    \"\"\"\n    Description: Load the vector store from the persist directory.\n\n    Input: chroma_client (chromadb.PersistentClient), config (dict), embeddings (HuggingFaceEmbeddings), collection_name (str)\n\n    Returns: Chroma\n    \"\"\"\n    if not os.path.exists(config[\"persist_dir\"]):\n        raise Exception(\n            \"Persist directory does not exist. Please run the training pipeline first.\"\n        )\n\n    return Chroma(\n        client=chroma_client,\n        persist_directory=config[\"persist_dir\"],\n        embedding_function=embeddings,\n        collection_name=collection_name,\n    )\n
"},{"location":"modules/llm_module/#llm.setup_vector_db_and_qa","title":"setup_vector_db_and_qa(config, data_type, client)","text":"

Description: Create the vector database using Chroma db with each type of data in its own collection. Doing so allows us to have a single database with multiple collections, reducing the number of databases we need to manage. This also downloads the embedding model if it does not exist. The QA chain is then initialized with the vector store and the configuration.

Input: config (dict), data_type (str), client (chromadb.PersistentClient)

Returns: qa (langchain.chains.retrieval_qa.base.RetrievalQA)

Source code in serving/modules/llm.py
def setup_vector_db_and_qa(config: dict, data_type: str, client:ClientAPI) -> langchain.chains.retrieval_qa.base.RetrievalQA:\n    \"\"\"\n    Description: Create the vector database using Chroma db with each type of data in its own collection. Doing so allows us to have a single database with multiple collections, reducing the number of databases we need to manage.\n    This also downloads the embedding model if it does not exist. The QA chain is then initialized with the vector store and the configuration.\n\n    Input: config (dict), data_type (str), client (chromadb.PersistentClient)\n\n    Returns: qa (langchain.chains.retrieval_qa.base.RetrievalQA)\n    \"\"\"\n\n    config[\"type_of_data\"] = data_type\n    # Download the data if it does not exist\n    openml_data_object, data_id, all_metadata = get_all_metadata_from_openml(\n        config=config\n    )\n    # Create the combined metadata dataframe\n    metadata_df, all_metadata = create_metadata_dataframe(\n        openml_data_object, data_id, all_metadata, config=config\n    )\n    # Create the vector store\n    vectordb = load_document_and_create_vector_store(\n        metadata_df, config=config, chroma_client=client\n    )\n    # Initialize the LLM chain and setup Retrieval QA\n    qa = initialize_llm_chain(vectordb=vectordb, config=config)\n    return qa\n
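For reference, the unit tests in tests/unit_testing.py exercise this entry point roughly as follows (assuming a loaded config dict):

import chromadb\nfrom modules.llm import setup_vector_db_and_qa\nfrom modules.results_gen import get_result_from_query\n\nclient = chromadb.PersistentClient(path=config[\"persist_dir\"])\nqa = setup_vector_db_and_qa(config=config, data_type=\"dataset\", client=client)\noutput_df, source_documents = get_result_from_query(\n    query=\"Find me a dataset about flowers that has a high number of instances.\",\n    qa=qa,\n    type_of_query=\"dataset\",\n    config=config,\n)\n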
"},{"location":"modules/metadata_module/","title":"Metadata module","text":""},{"location":"modules/metadata_module/#metadata_utils.combine_metadata","title":"combine_metadata(all_dataset_metadata, all_data_description_df)","text":"

Description: Combine the descriptions with the metadata table.

Input: all_dataset_metadata (pd.DataFrame) : The metadata table, all_data_description_df (pd.DataFrame) : The descriptions

Returns: The combined metadata table.

Source code in serving/modules/metadata_utils.py
def combine_metadata(all_dataset_metadata: pd.DataFrame, all_data_description_df: pd.DataFrame) -> pd.DataFrame:\n    \"\"\"\n    Description: Combine the descriptions with the metadata table.\n\n    Input: all_dataset_metadata (pd.DataFrame) : The metadata table,\n    all_data_description_df (pd.DataFrame) : The descriptions\n\n    Returns: The combined metadata table.\n    \"\"\"\n    # Combine the descriptions with the metadata table\n    all_dataset_metadata = pd.merge(\n        all_dataset_metadata, all_data_description_df, on=\"did\", how=\"inner\"\n    )\n\n    # Create a single column that has a combined string of all the metadata and the description in the form of \"column - value, column - value, ... description\"\n\n    all_dataset_metadata[\"Combined_information\"] = all_dataset_metadata.apply(\n        merge_all_columns_to_string, axis=1\n    )\n    return all_dataset_metadata\n
"},{"location":"modules/metadata_module/#metadata_utils.create_combined_information_df","title":"create_combined_information_df(data_id, descriptions, joined_qualities, joined_features)","text":"

Description: Create a dataframe with the combined information of the OpenML object.

Input: data_id (int) : The data id, descriptions (list) : The descriptions of the OpenML object, joined_qualities (list) : The joined qualities of the OpenML object, joined_features (list) : The joined features of the OpenML object

Returns: The dataframe with the combined information of the OpenML object.

Source code in serving/modules/metadata_utils.py
def create_combined_information_df(\n    # data_id, descriptions, joined_qualities, joined_features\n    data_id: int| Sequence[int], descriptions: Sequence[str], joined_qualities: Sequence[str], joined_features: Sequence[str]\n) -> pd.DataFrame:\n    \"\"\"\n    Description: Create a dataframe with the combined information of the OpenML object.\n\n    Input: data_id (int) : The data id, descriptions (list) : The descriptions of the OpenML object, joined_qualities (list) : The joined qualities of the OpenML object, joined_features (list) : The joined features of the OpenML object\n\n    Returns: The dataframe with the combined information of the OpenML object.\n    \"\"\"\n    return pd.DataFrame(\n        {\n            \"did\": data_id,\n            \"description\": descriptions,\n            \"qualities\": joined_qualities,\n            \"features\": joined_features,\n        }\n    )\n
"},{"location":"modules/metadata_module/#metadata_utils.create_metadata_dataframe","title":"create_metadata_dataframe(openml_data_object, data_id, all_dataset_metadata, config)","text":"

Creates a dataframe with all the metadata, joined columns with all information for the type of data specified in the config. If training is set to False, the dataframes are loaded from the files. If training is set to True, the dataframes are created and then saved to the files.

Parameters:

  • openml_data_object (list), required : The list of OpenML objects.
  • data_id (list), required : The list of data ids.
  • all_dataset_metadata (DataFrame), required : The metadata table.
  • config (dict), required : The config dictionary.

Returns:

  • pd.DataFrame : The combined metadata dataframe.
  • pd.DataFrame : The updated metadata table.

Source code in serving/modules/metadata_utils.py
def create_metadata_dataframe(\n    # openml_data_object, data_id, all_dataset_metadata, config\n    openml_data_object: Sequence[Union[openml.datasets.dataset.OpenMLDataset, openml.flows.flow.OpenMLFlow]], data_id: Sequence[int], all_dataset_metadata: pd.DataFrame, config: dict\n) -> Tuple[pd.DataFrame, pd.DataFrame]:\n    \"\"\"\n    Creates a dataframe with all the metadata, joined columns with all information\n    for the type of data specified in the config. If training is set to False,\n    the dataframes are loaded from the files. If training is set to True, the\n    dataframes are created and then saved to the files.\n\n    Args:\n        openml_data_object (list): The list of OpenML objects.\n        data_id (list): The list of data ids.\n        all_dataset_metadata (pd.DataFrame): The metadata table.\n        config (dict): The config dictionary.\n\n    Returns:\n        pd.DataFrame: The combined metadata dataframe.\n        pd.DataFrame: The updated metadata table.\n    \"\"\"\n    # use os.path.join to ensure compatibility with different operating systems\n    file_path = os.path.join(\n        config[\"data_dir\"], f\"all_{config['type_of_data']}_description.csv\"\n    )\n\n    if not config[\"training\"]:\n        return load_metadata(file_path), all_dataset_metadata\n\n    if config[\"type_of_data\"] == \"dataset\":\n        return process_dataset_metadata(\n            openml_data_object, data_id, all_dataset_metadata, file_path\n        )\n\n    if config[\"type_of_data\"] == \"flow\":\n        return process_flow_metadata(openml_data_object, data_id, file_path)\n\n    raise ValueError(f\"Unsupported type_of_data: {config['type_of_data']}\")\n
"},{"location":"modules/metadata_module/#metadata_utils.extract_attribute","title":"extract_attribute(attribute, attr_name)","text":"

Description: Extract an attribute from the OpenML object.

Input: attribute (object) : The OpenML object, attr_name (str) : The name of the attribute to extract

Returns: The attribute value if it exists, else an empty string.

Source code in serving/modules/metadata_utils.py
def extract_attribute(attribute: object, attr_name: str) -> str:\n    \"\"\"\n    Description: Extract an attribute from the OpenML object.\n\n    Input: attribute (object) : The OpenML object\n\n    Returns: The attribute value if it exists, else an empty string.\n    \"\"\"\n    return getattr(attribute, attr_name, \"\")\n
"},{"location":"modules/metadata_module/#metadata_utils.get_all_metadata_from_openml","title":"get_all_metadata_from_openml(config)","text":"

Description: Gets all the metadata from OpenML for the type of data specified in the config. If training is set to False, it loads the metadata from the files. If training is set to True, it gets the metadata from OpenML.

This uses parallel threads (pqdm); to ensure thread safety, install the package oslo.concurrency.

Input: config (dict) : The config dictionary

Returns: the raw OpenML objects, the list of data ids, and the metadata table as a dataframe.

Source code in serving/modules/metadata_utils.py
def get_all_metadata_from_openml(config: dict) -> Tuple[pd.DataFrame, Sequence[int], pd.DataFrame] | None:\n    \"\"\"\n    Description: Gets all the metadata from OpenML for the type of data specified in the config.\n    If training is set to False, it loads the metadata from the files. If training is set to True, it gets the metadata from OpenML.\n\n    This uses parallel threads (pqdm) and so to ensure thread safety, install the package oslo.concurrency.\n\n\n    Input: config (dict) : The config dictionary\n\n    Returns: all the data descriptions combined with data ids, data ids, and the raw openml objects in a dataframe.\n    \"\"\"\n\n    # save_filename = f\"./data/all_{config['type_of_data']}_metadata.pkl\"\n    # use os.path.join to ensure compatibility with different operating systems\n    save_filename = os.path.join(\n        config[\"data_dir\"], f\"all_{config['type_of_data']}_metadata.pkl\"\n    )\n    # If we are not training, we do not need to recreate the cache and can load the metadata from the files. If the files do not exist, raise an exception.\n    # TODO : Check if this behavior is correct, or if data does not exist, send to training pipeline?\n    if config[\"training\"] == False or config[\"ignore_downloading_data\"] == True:\n        # print(\"[INFO] Training is set to False.\")\n        # Check if the metadata files exist for all types of data\n        if not os.path.exists(save_filename):\n            raise Exception(\n                \"Metadata files do not exist. Please run the training pipeline first.\"\n            )\n        print(\"[INFO] Loading metadata from file.\")\n        # Load the metadata files for all types of data\n        return load_metadata_from_file(save_filename)\n\n    # If we are training, we need to recreate the cache and get the metadata from OpenML\n    if config[\"training\"] == True:\n        print(\"[INFO] Training is set to True.\")\n        # Gather all OpenML objects of the type of data\n        all_objects = get_openml_objects(config[\"type_of_data\"])\n\n        # subset the data for testing\n        if config[\"test_subset_2000\"] == True:\n            print(\"[INFO] Subsetting the data to 2000 rows.\")\n            all_objects = all_objects[:2000]\n\n        data_id = [int(all_objects.iloc[i][\"did\"]) for i in range(len(all_objects))]\n\n        print(\"[INFO] Initializing cache.\")\n        initialize_cache(config[\"type_of_data\"], data_id)\n\n        print(f\"[INFO] Getting {config['type_of_data']} metadata from OpenML.\")\n        openml_data_object = get_metadata_from_openml(config, data_id)\n\n        print(\"[INFO] Saving metadata to file.\")\n        save_metadata_to_file((openml_data_object, data_id, all_objects), save_filename)\n\n        return openml_data_object, data_id, all_objects\n
"},{"location":"modules/metadata_module/#metadata_utils.get_dataset_description","title":"get_dataset_description(dataset_id)","text":"

Get the dataset description from OpenML using the dataset id

Input: dataset_id (int) : The dataset id

Returns: data (openml.datasets.dataset.OpenMLDataset) : The dataset object from OpenML

Source code in serving/modules/metadata_utils.py
def get_dataset_description(dataset_id) -> openml.datasets.dataset.OpenMLDataset:\n    \"\"\"\n    Get the dataset description from OpenML using the dataset id\n\n    Input: dataset_id (int) : The dataset id\n\n    Returns: data (openml.datasets.dataset.OpenMLDataset) : The dataset object from OpenML\n    \"\"\"\n    # TODO : Check for objects that do not have qualities being not downloaded properly\n    # try:\n    data = openml.datasets.get_dataset(\n        dataset_id=dataset_id,\n        download_data=False,\n        download_qualities=True,\n        download_features_meta_data=True,\n    )\n\n    return data\n
"},{"location":"modules/metadata_module/#metadata_utils.get_flow_description","title":"get_flow_description(flow_id)","text":"

Get the flow description from OpenML using the flow id

Input: flow_id (int) : The flow id

Returns: data (openml.flows.flow.OpenMLFlow) : The flow object from OpenML

Source code in serving/modules/metadata_utils.py
def get_flow_description(flow_id: int) -> openml.flows.flow.OpenMLFlow:\n    \"\"\"\n    Get the flow description from OpenML using the flow id\n\n    Input: flow_id (int) : The flow id\n\n    Returns: data (openml.flows.flow.OpenMLFlow) : The flow object from OpenML\n    \"\"\"\n    return openml.flows.get_flow(flow_id=flow_id)\n
"},{"location":"modules/metadata_module/#metadata_utils.get_metadata_from_openml","title":"get_metadata_from_openml(config, data_id)","text":"

Get metadata from OpenML using parallel processing.

Source code in serving/modules/metadata_utils.py
def get_metadata_from_openml(config, data_id: Sequence[int]):\n    \"\"\"\n    Get metadata from OpenML using parallel processing.\n    \"\"\"\n    if config[\"type_of_data\"] == \"dataset\":\n        return pqdm(\n            data_id, get_dataset_description, n_jobs=config[\"data_download_n_jobs\"]\n        )\n    elif config[\"type_of_data\"] == \"flow\":\n        return pqdm(\n            data_id, get_flow_description, n_jobs=config[\"data_download_n_jobs\"]\n        )\n
"},{"location":"modules/metadata_module/#metadata_utils.get_openml_objects","title":"get_openml_objects(type_of_data)","text":"

Get OpenML objects based on the type of data.

Source code in serving/modules/metadata_utils.py
def get_openml_objects(type_of_data: str):\n    \"\"\"\n    Get OpenML objects based on the type of data.\n    \"\"\"\n    if type_of_data == \"dataset\":\n        return openml.datasets.list_datasets(output_format=\"dataframe\")\n    elif type_of_data == \"flow\":\n        all_objects = openml.flows.list_flows(output_format=\"dataframe\")\n        return all_objects.rename(columns={\"id\": \"did\"})\n    else:\n        raise ValueError(\"Invalid type_of_data specified\")\n
"},{"location":"modules/metadata_module/#metadata_utils.initialize_cache","title":"initialize_cache(type_of_data, data_id)","text":"

Initialize cache for the OpenML objects.

Source code in serving/modules/metadata_utils.py
def initialize_cache(type_of_data: str, data_id: Sequence[int]) -> None:\n    \"\"\"\n    Initialize cache for the OpenML objects.\n    \"\"\"\n    if type_of_data == \"dataset\":\n        get_dataset_description(data_id[0])\n    elif type_of_data == \"flow\":\n        get_flow_description(data_id[0])\n
"},{"location":"modules/metadata_module/#metadata_utils.join_attributes","title":"join_attributes(attribute, attr_name)","text":"

Description: Join the attributes of the OpenML object.

Input: attribute (object) : The OpenML object, attr_name (str) : The name of the attribute to join

Returns: The joined attributes if they exist, else an empty string. example: \"column - value, column - value, ...\"

Source code in serving/modules/metadata_utils.py
def join_attributes(attribute: object, attr_name: str) -> str:\n    \"\"\"\n    Description: Join the attributes of the OpenML object.\n\n    Input: attribute (object) : The OpenML object\n\n    Returns: The joined attributes if they exist, else an empty string.\n    example: \"column - value, column - value, ...\"\n    \"\"\"\n\n    return (\n        \" \".join([f\"{k} : {v},\" for k, v in getattr(attribute, attr_name, {}).items()])\n        if hasattr(attribute, attr_name)\n        else \"\"\n    )\n
"},{"location":"modules/metadata_module/#metadata_utils.load_metadata_from_file","title":"load_metadata_from_file(save_filename)","text":"

Load metadata from a file.

Source code in serving/modules/metadata_utils.py
def load_metadata_from_file(save_filename: str) -> Tuple[pd.DataFrame, Sequence[int], pd.DataFrame]:\n    \"\"\"\n    Load metadata from a file.\n    \"\"\"\n    with open(save_filename, \"rb\") as f:\n        return pickle.load(f)\n
"},{"location":"modules/metadata_module/#metadata_utils.merge_all_columns_to_string","title":"merge_all_columns_to_string(row)","text":"

Description: Create a single column that has a combined string of all the metadata and the description in the form of \"column - value, column - value, ... description\"

Input: row (pd.Series) : The row of the dataframe

Returns: The combined string of all the metadata and the description in the form of \"column - value, column - value, ... description\"

Source code in serving/modules/metadata_utils.py
def merge_all_columns_to_string(row: pd.Series) -> str:\n    \"\"\"\n    Description: Create a single column that has a combined string of all the metadata and the description in the form of \"column - value, column - value, ... description\"\n\n    Input: row (pd.Series) : The row of the dataframe\n\n    Returns: The combined string of all the metadata and the description in the form of \"column - value, column - value, ... description\"\n    \"\"\"\n\n    return \" \".join([f\"{col} - {val},\" for col, val in zip(row.index, row.values)])\n
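For example, with a hypothetical two-column row:

import pandas as pd\n\nrow = pd.Series({\"did\": 31, \"name\": \"credit-g\"})\nmerge_all_columns_to_string(row)  # -> 'did - 31, name - credit-g,'\n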
"},{"location":"modules/metadata_module/#metadata_utils.process_dataset_metadata","title":"process_dataset_metadata(openml_data_object, data_id, all_dataset_metadata, file_path)","text":"

Description: Process the dataset metadata.

Input: openml_data_object (list) : The list of OpenML objects, data_id (list) : The list of data ids, all_dataset_metadata (pd.DataFrame) : The metadata table, file_path (str) : The file path

Returns: The combined metadata dataframe and the updated metadata table.

Source code in serving/modules/metadata_utils.py
def process_dataset_metadata(\n    openml_data_object: Sequence[openml.datasets.dataset.OpenMLDataset], data_id: Sequence[int], all_dataset_metadata: pd.DataFrame, file_path: str\n) -> Tuple[pd.DataFrame, pd.DataFrame]:\n    \"\"\"\n    Description: Process the dataset metadata.\n\n    Input: openml_data_object (list) : The list of OpenML objects, data_id (list) : The list of data ids, all_dataset_metadata (pd.DataFrame) : The metadata table, file_path (str) : The file path\n\n    Returns: The combined metadata dataframe and the updated metadata table.\n    \"\"\"\n    descriptions = [\n        extract_attribute(attr, \"description\") for attr in openml_data_object\n    ]\n    joined_qualities = [\n        join_attributes(attr, \"qualities\") for attr in openml_data_object\n    ]\n    joined_features = [join_attributes(attr, \"features\") for attr in openml_data_object]\n\n    all_data_description_df = create_combined_information_df(\n        data_id, descriptions, joined_qualities, joined_features\n    )\n    all_dataset_metadata = combine_metadata(\n        all_dataset_metadata, all_data_description_df\n    )\n\n    all_dataset_metadata.to_csv(file_path)\n\n    return (\n        all_dataset_metadata[[\"did\", \"name\", \"Combined_information\"]],\n        all_dataset_metadata,\n    )\n
"},{"location":"modules/metadata_module/#metadata_utils.process_flow_metadata","title":"process_flow_metadata(openml_data_object, data_id, file_path)","text":"

Description: Process the flow metadata.

Input: openml_data_object (list) : The list of OpenML objects, data_id (list) : The list of data ids, file_path (str) : The file path

Returns: The combined metadata dataframe and the updated metadata table.

Source code in serving/modules/metadata_utils.py
def process_flow_metadata(openml_data_object: Sequence[openml.flows.flow.OpenMLFlow], data_id: Sequence[int], file_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:\n    \"\"\"\n    Description: Process the flow metadata.\n\n    Input: openml_data_object (list) : The list of OpenML objects, data_id (list) : The list of data ids, file_path (str) : The file path\n\n    Returns: The combined metadata dataframe and the updated metadata table.\n    \"\"\"\n    descriptions = [\n        extract_attribute(attr, \"description\") for attr in openml_data_object\n    ]\n    names = [extract_attribute(attr, \"name\") for attr in openml_data_object]\n    tags = [extract_attribute(attr, \"tags\") for attr in openml_data_object]\n\n    all_data_description_df = pd.DataFrame(\n        {\n            \"did\": data_id,\n            \"description\": descriptions,\n            \"name\": names,\n            \"tags\": tags,\n        }\n    )\n\n    all_data_description_df[\"Combined_information\"] = all_data_description_df.apply(\n        merge_all_columns_to_string, axis=1\n    )\n    all_data_description_df.to_csv(file_path)\n\n    return (\n        all_data_description_df[[\"did\", \"name\", \"Combined_information\"]],\n        all_data_description_df,\n    )\n
"},{"location":"modules/metadata_module/#metadata_utils.save_metadata_to_file","title":"save_metadata_to_file(data, save_filename)","text":"

Save metadata to a file.

Source code in serving/modules/metadata_utils.py
def save_metadata_to_file(data, save_filename: str):\n    \"\"\"\n    Save metadata to a file.\n    \"\"\"\n    with open(save_filename, \"wb\") as f:\n        pickle.dump(data, f)\n
"},{"location":"modules/result_gen/","title":"Result gen","text":""},{"location":"modules/result_gen/#results_gen.aggregate_multiple_queries_and_count","title":"aggregate_multiple_queries_and_count(queries, qa_dataset, config, group_cols=['id', 'name'], sort_by='query')","text":"

Description: Aggregate the results of multiple queries into a single dataframe and count the number of times a dataset appears in the results

Input: queries (list) : List of queries, group_cols (list) : List of columns to group by

Returns: Combined dataframe with the results of all queries

Source code in serving/modules/results_gen.py
def aggregate_multiple_queries_and_count(\n    queries, qa_dataset, config, group_cols=[\"id\", \"name\"], sort_by=\"query\"\n) -> pd.DataFrame:\n    \"\"\"\n    Description: Aggregate the results of multiple queries into a single dataframe and count the number of times a dataset appears in the results\n\n    Input:\n        queries: List of queries\n        group_cols: List of columns to group by\n\n    Returns: Combined dataframe with the results of all queries\n    \"\"\"\n    combined_df = pd.DataFrame()\n    for query in tqdm(queries, total=len(queries)):\n        result_data_frame = get_result_from_query(\n            query=query, qa=qa_dataset, type_of_query=\"dataset\", config=config\n        )\n        result_data_frame = result_data_frame[group_cols]\n        # Concat with combined_df with a column to store the query\n        result_data_frame[\"query\"] = query\n        combined_df = pd.concat([combined_df, result_data_frame])\n    combined_df = (\n        combined_df.groupby(group_cols)\n        .count()\n        .reset_index()\n        .sort_values(by=sort_by, ascending=False)\n    )\n    return combined_df\n
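A typical call looks like this (a sketch; qa and config come from setup_vector_db_and_qa and the loaded config file):

queries = [\"Find datasets about flowers\", \"Find datasets about mushrooms\"]\ncombined_df = aggregate_multiple_queries_and_count(\n    queries, qa_dataset=qa, config=config, group_cols=[\"id\", \"name\"], sort_by=\"query\"\n)\n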
"},{"location":"modules/result_gen/#results_gen.check_query","title":"check_query(query)","text":"

  • Description: Performs checks on the query: replaces %20 with the space character (browsers do this automatically when spaces are in the URL), removes leading and trailing spaces, and limits the query to 200 characters.

Input: query (str)

Returns: query (str) : The cleaned query

Source code in serving/modules/results_gen.py
def check_query(query: str) -> str:\n    \"\"\"\n    Description: Performs checks on the query\n    - Replaces %20 with space character (browsers do this automatically when spaces are in the URL)\n    - Removes leading and trailing spaces\n    - Limits the query to 200 characters\n\n    Input: query (str)\n\n    Returns: query (str) : The cleaned query\n    \"\"\"\n    if query == \"\":\n        raise ValueError(\"Query cannot be empty.\")\n    query = query.replace(\n        \"%20\", \" \"\n    )  # replace %20 with space character (browsers do this automatically when spaces are in the URL)\n    # query = query.replace(\"dataset\", \"\")\n    # query = query.replace(\"flow\", \"\")\n    query = query.strip()\n    query = query[:200]\n    return query\n
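For example:

check_query(\"  find%20me%20a%20flower%20dataset  \")  # -> 'find me a flower dataset'\n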
"},{"location":"modules/result_gen/#results_gen.create_output_dataframe","title":"create_output_dataframe(dict_results, type_of_data, ids_order)","text":"

Description: Create an output dataframe with the results. The URLs are API calls to the OpenML API for the specific type of data.

Input: dict_results (dict), type_of_data (str), ids_order (list)

Returns: A dataframe with the results and duplicate names removed.

Source code in serving/modules/results_gen.py
def create_output_dataframe(dict_results: dict, type_of_data: str, ids_order: list) -> pd.DataFrame:\n    \"\"\"\n    Description: Create an output dataframe with the results. The URLs are API calls to the OpenML API for the specific type of data.\n\n    Input: dict_results (dict), type_of_data (str)\n\n    Returns: A dataframe with the results and duplicate names removed.\n    \"\"\"\n    output_df = pd.DataFrame(dict_results).T.reset_index()\n    # order the rows based on the order of the ids\n    output_df[\"index\"] = output_df[\"index\"].astype(int)\n    output_df = output_df.set_index(\"index\").loc[ids_order].reset_index()\n    # output_df[\"urls\"] = output_df[\"index\"].apply(\n    #     lambda x: f\"https://www.openml.org/api/v1/json/{type_of_data}/{x}\"\n    # )\n    # https://www.openml.org/search?type=data&sort=runs&status=any&id=31\n    output_df[\"urls\"] = output_df[\"index\"].apply(\n        lambda x: f\"https://www.openml.org/search?type={type_of_data}&id={x}\"\n    )\n    output_df[\"urls\"] = output_df[\"urls\"].apply(make_clickable)\n    # data = openml.datasets.get_dataset(\n    # get rows with unique names\n    if type_of_data == \"data\":\n        output_df[\"command\"] = output_df[\"index\"].apply(\n            lambda x: f\"dataset = openml.datasets.get_dataset({x})\"\n        )\n    elif type_of_data == \"flow\":\n        output_df[\"command\"] = output_df[\"index\"].apply(\n            lambda x: f\"flow = openml.flows.get_flow({x})\"\n        )\n    output_df = output_df.drop_duplicates(subset=[\"name\"])\n    # order the columns\n    output_df = output_df[[\"index\", \"name\", \"command\", \"urls\", \"page_content\"]].rename(\n        columns={\"index\": \"id\", \"urls\": \"OpenML URL\", \"page_content\": \"Description\"}\n    )\n    return output_df\n
"},{"location":"modules/result_gen/#results_gen.fetch_results","title":"fetch_results(query, qa, type_of_query, config)","text":"

Description: Fetch results for the query using the QA chain.

Input: query (str), qa (langchain.chains.retrieval_qa.base.RetrievalQA), type_of_query (str), config (dict)

Returns: results[\"source_documents\"] (list)

Source code in serving/modules/results_gen.py
def fetch_results(query: str, qa: langchain.chains.retrieval_qa.base.RetrievalQA, type_of_query: str, config: dict) -> Sequence[Document]:\n    \"\"\"\n    Description: Fetch results for the query using the QA chain.\n\n    Input: query (str), qa (langchain.chains.retrieval_qa.base.RetrievalQA), type_of_query (str), config (dict)\n\n    Returns: results[\"source_documents\"] (list)\n    \"\"\"\n    results = qa.invoke(\n        input=query,\n        config={\"temperature\": config[\"temperature\"], \"top-p\": config[\"top_p\"]},\n    )\n    if config[\"long_context_reorder\"] == True:\n        results = long_context_reorder(results)\n    id_column = {\"dataset\": \"did\", \"flow\": \"id\", \"data\": \"did\"}\n    id_column = id_column[type_of_query]\n\n    if config[\"reranking\"] == True:\n        try:\n            print(\"[INFO] Reranking results...\")\n            ranker = Ranker(model_name=\"ms-marco-MiniLM-L-12-v2\", cache_dir=\"/tmp/\")\n            rerankrequest = RerankRequest(\n                query=query,\n                passages=[\n                    {\"id\": result.metadata[id_column], \"text\": result.page_content}\n                    for result in results\n                ],\n            )\n            ranking = ranker.rerank(rerankrequest)\n            ids = [result[\"id\"] for result in ranking]\n            ranked_results = [\n                result for result in results if result.metadata[id_column] in ids\n            ]\n            print(\"[INFO] Reranking complete.\")\n            return ranked_results\n        except Exception as e:\n            print(f\"[ERROR] Reranking failed: {e}\")\n            return results\n\n    else:\n        return results\n
"},{"location":"modules/result_gen/#results_gen.get_result_from_query","title":"get_result_from_query(query, qa, type_of_query, config)","text":"

Description: Get the result from the query using the QA chain and return the results in a dataframe that is then sent to the frontend.

Input: query (str), qa (langchain.chains.retrieval_qa.base.RetrievalQA), type_of_query (str), config (dict)

Returns: output_df (pd.DataFrame), source_documents (list)

Source code in serving/modules/results_gen.py
def get_result_from_query(query, qa, type_of_query, config) -> Tuple[pd.DataFrame, Sequence[Document]]:\n    \"\"\"\n    Description: Get the result from the query using the QA chain and return the results in a dataframe that is then sent to the frontend.\n\n    Input: query (str), qa (langchain.chains.retrieval_qa.base.RetrievalQA), type_of_query (str)\n\n    Returns: output_df (pd.DataFrame)\n    \"\"\"\n    if type_of_query == \"dataset\":\n        # Fixing the key_name for dataset because of the way the OpenML API returns the data\n        type_of_query = \"data\"\n    elif type_of_query == \"flow\":\n        type_of_query = \"flow\"\n    else:\n        raise ValueError(f\"Unsupported type_of_data: {type_of_query}\")\n\n    # Process the query\n    query = check_query(query)\n    if query == \"\":\n        return pd.DataFrame(), []\n    source_documents = fetch_results(\n        query, qa, config=config, type_of_query=type_of_query\n    )\n    dict_results, ids_order = process_documents(source_documents)\n    output_df = create_output_dataframe(dict_results, type_of_query, ids_order)\n\n    return output_df, source_documents\n
"},{"location":"modules/result_gen/#results_gen.long_context_reorder","title":"long_context_reorder(results)","text":"

Description: Lost in the middle reorder: the less relevant documents will be at the middle of the list and more relevant elements at beginning / end. See: https://arxiv.org/abs//2307.03172

Input: results (list)

Returns: reordered results (list)

Source code in serving/modules/results_gen.py
def long_context_reorder(results: Sequence[Document]) -> Sequence[Document]:\n    \"\"\"\n    Description: Lost in the middle reorder: the less relevant documents will be at the\n    middle of the list and more relevant elements at beginning / end.\n    See: https://arxiv.org/abs//2307.03172\n\n    Input: results (list)\n\n    Returns: reorder results (list)\n    \"\"\"\n    print(\"[INFO] Reordering results...\")\n    reordering = LongContextReorder()\n    results = reordering.transform_documents(results)\n    print(\"[INFO] Reordering complete.\")\n    return results\n
"},{"location":"modules/result_gen/#results_gen.make_clickable","title":"make_clickable(val)","text":"

Description: Make the URL clickable in the dataframe.

Source code in serving/modules/results_gen.py
def make_clickable(val : str) -> str:\n    \"\"\"\n    Description: Make the URL clickable in the dataframe.\n    \"\"\"\n    return '<a href=\"{}\">{}</a>'.format(val, val)\n
"},{"location":"modules/result_gen/#results_gen.process_documents","title":"process_documents(source_documents)","text":"

Description: Process the source documents and create an ordered dictionary keyed by the document id ("did"), with the name and page content as the values.

Input: source_documents (list)

Returns: dict_results (dict), ids (list)

Source code in serving/modules/results_gen.py
def process_documents(source_documents : Sequence[Document]) -> Tuple[OrderedDict, list]:\n    \"\"\"\n    Description: Process the source documents and create a dictionary with the key_name as the key and the name and page content as the values.\n\n    Input: source_documents (list), key_name (str)\n\n    Returns: dict_results (dict)\n    \"\"\"\n    dict_results = OrderedDict()\n    for result in source_documents:\n        dict_results[result.metadata[\"did\"]] = {\n            \"name\": result.metadata[\"name\"],\n            \"page_content\": result.page_content,\n        }\n    ids = [result.metadata[\"did\"] for result in source_documents]\n    return dict_results, ids\n
"}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"RAG pipeline for OpenML","text":"
  • This repository contains the code for the RAG pipeline for OpenML.
"},{"location":"#getting-started","title":"Getting started","text":"
  • A docker image will be provided at a later date for easier setup
  • Clone the repository
  • Create a virtual environment and activate it
  • Install the requirements using pip install -r requirements.txt
  • Run training.py (for the first time, or to update the model). This handles metadata download, embedding creation, and building the vector store. (Refer to the training section for more details)
  • Install Ollama (https://ollama.com/) and download the models ollama run qwen2:1.5b and ollama run phi3
  • Run uvicorn backend:app to start the FastAPI server.
  • Run streamlit run main.py to start the Streamlit frontend (this uses the FastAPI server so make sure it is running)
  • Enjoy :)
"},{"location":"configuration/","title":"Configuration","text":"
  • The main config file is config.json
  • Possible options are as follows:
  • rqa_prompt_template: The template for the RAG pipeline search prompt. This is used by the model to query the database.
  • llm_prompt_template: The template for the summary generator LLM prompt.
  • num_return_documents: Number of documents to return for a query. Too high a number can lead to Out of Memory errors. (Defaults to 50)
  • embedding_model: The model used to generate embeddings for the documents; queries are embedded with the same model so they can be compared against the stored documents. (Defaults to BAAI/bge-large-en-v1.5)
    • Other models that have been tested:
      • BAAI/bge-base-en-v1.5
      • BAAI/bge-large-en-v1.5
      • WhereIsAI/UAE-Large-V1
  • llm_model: The model used for generating the result summary. (Defaults to qwen2:1.5b)
  • data_dir: The directory to store the intermediate data like tables/databases etc. (Defaults to ./data/)
  • persist_dir: The directory for the cached Chroma database; the document embeddings are stored here under a unique hash. (Defaults to ./data/chroma_db/)
  • testing_flag: Enables testing mode by using subsets of the data for quick debugging. This is used to test the pipeline and is not recommended for normal use. (Defaults to False)
  • data_download_n_jobs: Number of jobs to run in parallel for downloading data. (Defaults to 20)
  • training: Whether to train the model or not. (Defaults to False) This is automatically set to True when running the training.py script; do NOT set it to True manually.
  • search_type: The type of vector comparison to use. (Defaults to "similarity")
  • reranking: Whether to rerank the results using the FlashRank algorithm. (Defaults to False)
  • long_context_reorder: Whether to reorder the results using the Long Context Reordering algorithm. (Defaults to False) A minimal example config is sketched below.
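To make these options concrete, here is a minimal sketch of a config.json assembled from the documented defaults. It is illustrative, not the full schema: the unit tests below check for additional keys (temperature, top_p, num_documents_for_llm, ignore_downloading_data, test_subset_2000) that are not documented above, and the prompt templates are project-specific, so they are elided here.

import json

# Illustrative config built from the documented defaults; this is a sketch,
# not the repository's authoritative schema.
config = {
    "rqa_prompt_template": "...",  # elided: project-specific prompt
    "llm_prompt_template": "...",  # elided: project-specific prompt
    "num_return_documents": 50,
    "embedding_model": "BAAI/bge-large-en-v1.5",
    "llm_model": "qwen2:1.5b",
    "data_dir": "./data/",
    "persist_dir": "./data/chroma_db/",
    "testing_flag": False,
    "data_download_n_jobs": 20,
    "training": False,
    "search_type": "similarity",
    "reranking": False,
    "long_context_reorder": False,
}

with open("config.json", "w") as f:
    json.dump(config, f, indent=4)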
"},{"location":"docker/","title":"Docker container","text":"
  • Still a WIP
  • Run docker compose build --progress=plain
"},{"location":"docker/#potential-errors","title":"Potential Errors","text":"
  • If you get a memory error, you can run docker system prune. Be careful with this command: it removes all stopped containers, all dangling images, and all unused networks, so make sure no container holds important data before running it.
  • On Docker Desktop for Mac, increase the memory limit to as much as your system can handle.
"},{"location":"inference/","title":"Inference","text":"
  • Run inference by starting uvicorn main:app and streamlit run main.py in separate processes.
"},{"location":"testing/","title":"Testing","text":""},{"location":"testing/#unit-testing","title":"Unit Testing","text":"
  • Run python -m unittest tests/unit_testing.py to run the unit tests.
"},{"location":"testing/#load-testing","title":"Load Testing","text":"
  • Load testing is done with Locust, a load-testing tool that simulates many users querying the API so you can measure its performance under load.
  • You can configure the number of users, the hatch rate, and how long the test runs.
"},{"location":"testing/#running-the-load-test","title":"Running the load test","text":"
  • Start the FastAPI server using uvicorn main:app
  • In a separate terminal, start Locust: locust -f tests/locust_test.py --host http://127.0.0.1:8000 (a minimal locustfile sketch follows)
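The sketch below shows what such a locustfile might look like. The endpoint path is a hypothetical placeholder, since the API routes are not documented here; point it at whatever route the FastAPI server actually exposes.

from locust import HttpUser, between, task

class SearchUser(HttpUser):
    # Each simulated user waits 1-3 seconds between requests.
    wait_time = between(1, 3)

    @task
    def query_datasets(self):
        # Hypothetical endpoint; replace with the real FastAPI route.
        self.client.get("/dataset/Find me a dataset about flowers")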
"},{"location":"testing/#all-tests","title":"All tests","text":"

Bases: TestCase

Source code in tests/unit_testing.py
class TestConfig(unittest.TestCase):\n    def __init__(self, *args, **kwargs):\n        super().__init__(*args, **kwargs)\n        self.client = chromadb.PersistentClient(path=config[\"persist_dir\"])\n        self.config_keys = [\"rqa_prompt_template\", \"llm_prompt_template\",\n        \"num_return_documents\", \"embedding_model\", \"llm_model\", \"num_documents_for_llm\", \"data_dir\", \"persist_dir\", \"testing_flag\", \"ignore_downloading_data\", \"test_subset_2000\", \"data_download_n_jobs\", \"training\", \"temperature\", \"top_p\", \"search_type\", \"reranking\", \"long_context_reorder\"]\n        self.query_test_dict = {\n            \"dataset\": \"Find me a dataset about flowers that has a high number of instances.\",\n            \"flow\": \"Find me a flow that uses the RandomForestClassifier.\",\n        }\n    def test_check_data_dirs(self):\n        \"\"\"\n        Description: Check if the data directory exists.\n        Returns: None\n        \"\"\"\n        self.assertTrue(os.path.exists(config[\"data_dir\"]))\n        self.assertTrue(os.path.exists(config[\"persist_dir\"]))\n\n    def test_config(self):\n        \"\"\"\n        Description: Check if the config has the required keys.\n        Returns: None\n        \"\"\"\n        for key in self.config_keys:\n            self.assertIn(key, config.keys())\n\n    def test_setup_vector_db_and_qa(self):\n        \"\"\"\n        Description: Check if the setup_vector_db_and_qa function works as expected.\n        Returns: None\n        \"\"\"\n        for type_of_data in [\"dataset\", \"flow\"]:\n            self.qa = setup_vector_db_and_qa(\n                config=config, data_type=type_of_data, client=self.client\n            )\n            self.assertIsNotNone(self.qa)\n            self.result_data_frame = get_result_from_query(\n                query=self.query_test_dict[type_of_data],\n                qa=self.qa,\n                type_of_query=type_of_data,\n                config=config,\n            )\n            self.assertIsNotNone(self.result_data_frame)\n
"},{"location":"testing/#unit_testing.TestConfig.test_check_data_dirs","title":"test_check_data_dirs()","text":"

Description: Check if the data directory exists. Returns: None

Source code in tests/unit_testing.py
def test_check_data_dirs(self):\n    \"\"\"\n    Description: Check if the data directory exists.\n    Returns: None\n    \"\"\"\n    self.assertTrue(os.path.exists(config[\"data_dir\"]))\n    self.assertTrue(os.path.exists(config[\"persist_dir\"]))\n
"},{"location":"testing/#unit_testing.TestConfig.test_config","title":"test_config()","text":"

Description: Check if the config has the required keys. Returns: None

Source code in tests/unit_testing.py
def test_config(self):\n    \"\"\"\n    Description: Check if the config has the required keys.\n    Returns: None\n    \"\"\"\n    for key in self.config_keys:\n        self.assertIn(key, config.keys())\n
"},{"location":"testing/#unit_testing.TestConfig.test_setup_vector_db_and_qa","title":"test_setup_vector_db_and_qa()","text":"

Description: Check if the setup_vector_db_and_qa function works as expected. Returns: None

Source code in tests/unit_testing.py
def test_setup_vector_db_and_qa(self):\n    \"\"\"\n    Description: Check if the setup_vector_db_and_qa function works as expected.\n    Returns: None\n    \"\"\"\n    for type_of_data in [\"dataset\", \"flow\"]:\n        self.qa = setup_vector_db_and_qa(\n            config=config, data_type=type_of_data, client=self.client\n        )\n        self.assertIsNotNone(self.qa)\n        self.result_data_frame = get_result_from_query(\n            query=self.query_test_dict[type_of_data],\n            qa=self.qa,\n            type_of_query=type_of_data,\n            config=config,\n        )\n        self.assertIsNotNone(self.result_data_frame)\n
"},{"location":"training/","title":"Training","text":"
  • No new model is trained here; the existing embedding model is used to compute embeddings for the documents. The name "training" may be misleading, but it was chosen to keep the naming consistent with other codebases.
  • (We may fine-tune the model in the future.)
  • The training script is training.py; running it takes care of everything.
"},{"location":"training/#what-does-the-training-script-do","title":"What does the training script do?","text":"
  • Load the config file and set the necessary variables
  • If testing_flag is set to True, the script uses a subset of the data for quick debugging. In that case:
    • persist_dir is set to ./data/chroma_db_testing
    • test_subset_2000 is set to True
    • data_dir is set to ./data/testing_data/
  • If testing_flag is set to False, the script uses the entire dataset
  • For all datasets in the OpenML dataset list (a sketch of the overall flow follows this list):
    • Download the dataset
    • Create the vector database with computed embeddings
    • Create a vectordb retriever
    • Run some test queries
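Put together, the flow corresponds roughly to the sketch below. This is a simplified sketch, not the actual training.py: the import paths assume the serving/modules layout used in the module docs, and setup_vector_db_and_qa (documented below) wraps the metadata download, dataframe creation, and vector-store steps.

import chromadb

# Assumed import paths, following the serving/modules layout.
from modules.general_utils import load_config_and_device
from modules.llm import setup_vector_db_and_qa
from modules.results_gen import get_result_from_query

config = load_config_and_device("config.json", training=True)
config["training"] = True  # training.py sets this automatically

client = chromadb.PersistentClient(path=config["persist_dir"])

for data_type in ["dataset", "flow"]:
    # Downloads metadata, builds the vector store, and returns a retriever.
    qa = setup_vector_db_and_qa(config=config, data_type=data_type, client=client)
    # Run a test query against the freshly built store.
    output_df, source_documents = get_result_from_query(
        query="Find me a dataset about flowers",
        qa=qa,
        type_of_query=data_type,
        config=config,
    )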
"},{"location":"modules/general_utils/","title":"General utils","text":""},{"location":"modules/general_utils/#general_utils.find_device","title":"find_device(training=False)","text":"

Description: Find the device to use for the pipeline. If CUDA is available, use it; otherwise use MPS if it is available; otherwise fall back to the CPU.

Input: training (bool) : Whether the pipeline is being used for training or not.

Returns: device (str) : The device to use for the pipeline.

Source code in serving/modules/general_utils.py
def find_device(training: bool = False ) -> str:\n    \"\"\"\n    Description: Find the device to use for the pipeline. If cuda is available, use it. If not, check if MPS is available and use it. If not, use CPU.\n\n    Input: training (bool) : Whether the pipeline is being used for training or not.\n\n    Returns: device (str) : The device to use for the pipeline.\n    \"\"\"\n    print(\"[INFO] Finding device.\")\n    if torch.cuda.is_available():\n        return \"cuda\"\n    elif torch.backends.mps.is_available():\n        if training == False:\n            # loading metadata on mps for inference is quite slow. So disabling for now.\n            return \"cpu\"\n        return \"mps\"\n    else:\n        return \"cpu\"\n
"},{"location":"modules/general_utils/#general_utils.load_config_and_device","title":"load_config_and_device(config_file, training=False)","text":"

Description: Load the config file and find the device to use for the pipeline.

Input: config_file (str) : The path to the config file. training (bool) : Whether the pipeline is being used for training or not.

Returns: config (dict) : The config dictionary, with config["device"] set to the device the pipeline should use.

Source code in serving/modules/general_utils.py
def load_config_and_device(config_file: str, training: bool = False) -> dict:\n    \"\"\"\n    Description: Load the config file and find the device to use for the pipeline.\n\n    Input: config_file (str) : The path to the config file.\n    training (bool) : Whether the pipeline is being used for training or not.\n\n    Returns: config (dict) : The config dictionary + device (str) : The device to use for the pipeline.\n    \"\"\"\n    # Check if the config file exists and load it\n    if not os.path.exists(config_file):\n        raise Exception(\"Config file does not exist.\")\n    with open(config_file, \"r\") as f:\n        config = json.load(f)\n\n    # Find device and set it in the config between cpu and cuda and mps if available\n    config[\"device\"] = find_device(training)\n    print(f\"[INFO] Device found: {config['device']}\")\n    return config\n
"},{"location":"modules/llm_module/","title":"Llm module","text":""},{"location":"modules/llm_module/#llm.add_documents_to_db","title":"add_documents_to_db(db, unique_docs, unique_ids)","text":"

Description: Add documents to the vector store in batches of 512.

Input: db (Chroma), unique_docs (list), unique_ids (list)

Returns: None

Source code in serving/modules/llm.py
def add_documents_to_db(db, unique_docs, unique_ids):\n    \"\"\"\n    Description: Add documents to the vector store in batches of 200.\n\n    Input: db (Chroma), unique_docs (list), unique_ids (list)\n\n    Returns: None\n    \"\"\"\n    bs = 512\n    if len(unique_docs) < bs:\n        db.add_documents(unique_docs, ids=unique_ids)\n    else:\n        for i in tqdm(range(0, len(unique_docs), bs)):\n            db.add_documents(unique_docs[i : i + bs], ids=unique_ids[i : i + bs])\n
"},{"location":"modules/llm_module/#llm.create_vector_store","title":"create_vector_store(metadata_df, chroma_client, config, embeddings, collection_name)","text":"

Description: Create the vector store using Chroma db. The documents are loaded and processed, unique documents are generated, and the documents are added to the vector store.

Input: metadata_df (pd.DataFrame), chroma_client (chromadb.PersistentClient), config (dict), embeddings (HuggingFaceEmbeddings), collection_name (str)

Returns: db (Chroma)

Source code in serving/modules/llm.py
def create_vector_store(\n    metadata_df: pd.DataFrame, chroma_client:ClientAPI, config: dict, embeddings: HuggingFaceEmbeddings, collection_name: str \n) -> Chroma:\n    \"\"\"\n    Description: Create the vector store using Chroma db. The documents are loaded and processed, unique documents are generated, and the documents are added to the vector store.\n\n    Input: metadata_df (pd.DataFrame), chroma_client (chromadb.PersistentClient), config (dict), embeddings (HuggingFaceEmbeddings), collection_name (str)\n\n    Returns: db (Chroma)\n    \"\"\"\n\n    db = Chroma(\n        client=chroma_client,\n        embedding_function=embeddings,\n        persist_directory=config[\"persist_dir\"],\n        collection_name=collection_name,\n    )\n\n    documents = load_and_process_data(\n        metadata_df, page_content_column=\"Combined_information\"\n    )\n    if config[\"testing_flag\"]:\n        # subset the data for testing\n        if config[\"test_subset_2000\"] == True:\n            print(\"[INFO] Subsetting the data to 2000 rows.\")\n            documents = documents[:2000]\n    unique_docs, unique_ids = generate_unique_documents(documents, db)\n\n    print(\n        f\"Number of unique documents: {len(unique_docs)} vs Total documents: {len(documents)}\"\n    )\n    if len(unique_docs) == 0:\n        print(\"No new documents to add.\")\n        return db\n    else:\n        # db.add_documents(unique_docs, ids=unique_ids)\n        add_documents_to_db(db, unique_docs, unique_ids)\n\n    return db\n
"},{"location":"modules/llm_module/#llm.generate_unique_documents","title":"generate_unique_documents(documents, db)","text":"Generate unique documents by removing duplicates. This is done by generating unique IDs for the documents and keeping only one of the duplicate IDs.

Source: https://stackoverflow.com/questions/76265631/chromadb-add-single-document-only-if-it-doesnt-exist

Input: documents (list)

Returns: unique_docs (list), unique_ids (list)

Source code in serving/modules/llm.py
def generate_unique_documents(documents: list, db: Chroma) -> tuple:\n    \"\"\"\n    Description: Generate unique documents by removing duplicates. This is done by generating unique IDs for the documents and keeping only one of the duplicate IDs.\n        Source: https://stackoverflow.com/questions/76265631/chromadb-add-single-document-only-if-it-doesnt-exist\n\n    Input: documents (list)\n\n    Returns: unique_docs (list), unique_ids (list)\n    \"\"\"\n\n    # Remove duplicates based on ID (from database)\n    new_document_ids = set([str(x.metadata[\"did\"]) for x in documents])\n    print(f\"[INFO] Generating unique documents. Total documents: {len(documents)}\")\n    try:\n        old_dids = set([str(x[\"did\"]) for x in db.get()[\"metadatas\"]])\n    except KeyError:\n        old_dids = set([str(x[\"id\"]) for x in db.get()[\"metadatas\"]])\n\n    new_dids = new_document_ids - old_dids\n    documents = [x for x in documents if str(x.metadata[\"did\"]) in new_dids]\n    ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS,doc.page_content)) for doc in documents]\n\n    # Remove duplicates based on document content (from new documents)\n    unique_ids = list(set(ids))\n    seen_ids = set()\n    unique_docs = [\n            doc\n            for doc, id in zip(documents, ids)\n            if id not in seen_ids and (seen_ids.add(id) or True)\n        ]\n\n    return unique_docs, unique_ids\n
"},{"location":"modules/llm_module/#llm.get_collection_name","title":"get_collection_name(config)","text":"

Description: Get the collection name based on the type of data provided in the config.

Input: config (dict)

Returns: str

Source code in serving/modules/llm.py
def get_collection_name(config: dict) -> str:\n    \"\"\"\n    Description: Get the collection name based on the type of data provided in the config.\n\n    Input: config (dict)\n\n    Returns: str\n    \"\"\"\n    return {\"dataset\": \"datasets\", \"flow\": \"flows\"}.get(\n        config[\"type_of_data\"], \"default\"\n    )\n
"},{"location":"modules/llm_module/#llm.get_llm_chain","title":"get_llm_chain(config)","text":"

Description: Get the LLM chain with the specified model and prompt template.

Input: config (dict)

Returns: LLMChain

Source code in serving/modules/llm.py
def get_llm_chain(config: dict) -> LLMChain:\n    \"\"\"\n    Description: Get the LLM chain with the specified model and prompt template.\n\n    Input: config (dict)\n\n    Returns: LLMChain\n    \"\"\"\n\n    llm = Ollama(\n        model = config[\"llm_model\"] \n    )  \n    map_template = config[\"llm_prompt_template\"]\n    map_prompt = PromptTemplate.from_template(map_template)\n    return LLMChain(llm=llm, prompt=map_prompt)\n
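A sketch of invoking the chain. The input variable name depends entirely on llm_prompt_template, so "docs" below is a hypothetical placeholder; use whatever variable the template actually declares.

chain = get_llm_chain(config)
# "docs" is a hypothetical template variable.
result = chain.invoke({"docs": "description text to summarize"})
print(result["text"])  # LLMChain stores the generation under the "text" key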
"},{"location":"modules/llm_module/#llm.initialize_llm_chain","title":"initialize_llm_chain(vectordb, config)","text":"

Description: Initialize the LLM chain and setup Retrieval QA with the specified configuration.

Input: vectordb (Chroma), config (dict)

Returns: qa (langchain.chains.retrieval_qa.base.RetrievalQA)

Source code in serving/modules/llm.py
def initialize_llm_chain(\n    vectordb: Chroma,\n    config : dict\n) -> langchain.chains.retrieval_qa.base.RetrievalQA:\n    \"\"\"\n    Description: Initialize the LLM chain and setup Retrieval QA with the specified configuration.\n\n    Input: vectordb (Chroma), config (dict)\n\n    Returns: qa (langchain.chains.retrieval_qa.base.RetrievalQA)\n    \"\"\"\n\n    return vectordb.as_retriever(\n        search_type=config[\"search_type\"],\n        search_kwargs={\"k\": config[\"num_return_documents\"]},\n    )\n
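Note that, despite the name, this returns a plain retriever rather than a full RetrievalQA chain. A minimal usage sketch, assuming vectordb was created or loaded as above:

retriever = initialize_llm_chain(vectordb=vectordb, config=config)
# Returns up to num_return_documents matching documents.
docs = retriever.invoke("Find me a dataset about flowers")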
"},{"location":"modules/llm_module/#llm.load_and_process_data","title":"load_and_process_data(metadata_df, page_content_column)","text":"

Description: Load and process the data for the vector store. Split the documents into chunks of 1000 characters.

Input: metadata_df (pd.DataFrame), page_content_column (str)

Returns: chunked documents (list)

Source code in serving/modules/llm.py
def load_and_process_data(metadata_df: pd.DataFrame, page_content_column: str) -> list:\n    \"\"\"\n    Description: Load and process the data for the vector store. Split the documents into chunks of 1000 characters.\n\n    Input: metadata_df (pd.DataFrame), page_content_column (str)\n\n    Returns: chunked documents (list)\n    \"\"\"\n    # Load data\n    loader = DataFrameLoader(metadata_df, page_content_column=page_content_column)\n    documents = loader.load()\n\n    # Split documents\n    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)\n    documents = text_splitter.split_documents(documents)\n\n    return documents\n
"},{"location":"modules/llm_module/#llm.load_document_and_create_vector_store","title":"load_document_and_create_vector_store(metadata_df, chroma_client, config)","text":"

Loads the documents and creates the vector store. If the training flag is set to True, the documents are added to the vector store. If the training flag is set to False, the vector store is loaded from the persist directory.

Parameters:

  • metadata_df (DataFrame) : The metadata dataframe. (required)
  • chroma_client (PersistentClient) : The Chroma client. (required)
  • config (dict) : The configuration dictionary. (required)

Returns:

  • Chroma : The Chroma vector store.

Source code in serving/modules/llm.py
def load_document_and_create_vector_store(metadata_df: pd.DataFrame, chroma_client:ClientAPI , config: dict) -> Chroma:\n    \"\"\"\n    Loads the documents and creates the vector store. If the training flag is set to True,\n    the documents are added to the vector store. If the training flag is set to False,\n    the vector store is loaded from the persist directory.\n\n    Args:\n        metadata_df (pd.DataFrame): The metadata dataframe.\n        chroma_client (chromadb.PersistentClient): The Chroma client.\n        config (dict): The configuration dictionary.\n\n    Returns:\n        Chroma: The Chroma vector store.\n    \"\"\"\n    embeddings = load_model(config)\n    collection_name = get_collection_name(config)\n\n    if not config[\"training\"]:\n        return load_vector_store(chroma_client, config, embeddings, collection_name)\n\n    return create_vector_store(\n        metadata_df, chroma_client, config, embeddings, collection_name\n    )\n
"},{"location":"modules/llm_module/#llm.load_model","title":"load_model(config)","text":"

Description: Load the model using HuggingFaceEmbeddings.

Input: config (dict)

Returns: HuggingFaceEmbeddings

Source code in serving/modules/llm.py
def load_model(config: dict) -> HuggingFaceEmbeddings | None:\n    \"\"\"\n    Description: Load the model using HuggingFaceEmbeddings.\n\n    Input: config (dict)\n\n    Returns: HuggingFaceEmbeddings\n    \"\"\"\n    print(\"[INFO] Loading model...\")\n    model_kwargs = {\"device\": config[\"device\"], \"trust_remote_code\": True}\n    encode_kwargs = {\"normalize_embeddings\": True}\n    embeddings = HuggingFaceEmbeddings(\n        model_name=config[\"embedding_model\"],\n        model_kwargs=model_kwargs,\n        encode_kwargs=encode_kwargs,\n        show_progress = True,\n        # trust_remote_code=True\n    )\n    print(\"[INFO] Model loaded.\")\n    return embeddings\n
"},{"location":"modules/llm_module/#llm.load_vector_store","title":"load_vector_store(chroma_client, config, embeddings, collection_name)","text":"

Description: Load the vector store from the persist directory.

Input: chroma_client (chromadb.PersistentClient), config (dict), embeddings (HuggingFaceEmbeddings), collection_name (str)

Returns: Chroma

Source code in serving/modules/llm.py
def load_vector_store(chroma_client: ClientAPI, config: dict, embeddings: HuggingFaceEmbeddings, collection_name: str) -> Chroma:\n    \"\"\"\n    Description: Load the vector store from the persist directory.\n\n    Input: chroma_client (chromadb.PersistentClient), config (dict), embeddings (HuggingFaceEmbeddings), collection_name (str)\n\n    Returns: Chroma\n    \"\"\"\n    if not os.path.exists(config[\"persist_dir\"]):\n        raise Exception(\n            \"Persist directory does not exist. Please run the training pipeline first.\"\n        )\n\n    return Chroma(\n        client=chroma_client,\n        persist_directory=config[\"persist_dir\"],\n        embedding_function=embeddings,\n        collection_name=collection_name,\n    )\n
"},{"location":"modules/llm_module/#llm.setup_vector_db_and_qa","title":"setup_vector_db_and_qa(config, data_type, client)","text":"

Description: Create the vector database using Chroma db with each type of data in its own collection. Doing so allows us to have a single database with multiple collections, reducing the number of databases we need to manage. This also downloads the embedding model if it does not exist. The QA chain is then initialized with the vector store and the configuration.

Input: config (dict), data_type (str), client (chromadb.PersistentClient)

Returns: qa (langchain.chains.retrieval_qa.base.RetrievalQA)

Source code in serving/modules/llm.py
def setup_vector_db_and_qa(config: dict, data_type: str, client:ClientAPI) -> langchain.chains.retrieval_qa.base.RetrievalQA:\n    \"\"\"\n    Description: Create the vector database using Chroma db with each type of data in its own collection. Doing so allows us to have a single database with multiple collections, reducing the number of databases we need to manage.\n    This also downloads the embedding model if it does not exist. The QA chain is then initialized with the vector store and the configuration.\n\n    Input: config (dict), data_type (str), client (chromadb.PersistentClient)\n\n    Returns: qa (langchain.chains.retrieval_qa.base.RetrievalQA)\n    \"\"\"\n\n    config[\"type_of_data\"] = data_type\n    # Download the data if it does not exist\n    openml_data_object, data_id, all_metadata = get_all_metadata_from_openml(\n        config=config\n    )\n    # Create the combined metadata dataframe\n    metadata_df, all_metadata = create_metadata_dataframe(\n        openml_data_object, data_id, all_metadata, config=config\n    )\n    # Create the vector store\n    vectordb = load_document_and_create_vector_store(\n        metadata_df, config=config, chroma_client=client\n    )\n    # Initialize the LLM chain and setup Retrieval QA\n    qa = initialize_llm_chain(vectordb=vectordb, config=config)\n    return qa\n
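Outside the test suite, inference usage mirrors the unit test shown earlier; a minimal sketch, assuming config.json exists and the training pipeline has already built the vector store (import paths follow the serving/modules layout):

import chromadb

from modules.general_utils import load_config_and_device
from modules.llm import setup_vector_db_and_qa
from modules.results_gen import get_result_from_query

config = load_config_and_device("config.json")
client = chromadb.PersistentClient(path=config["persist_dir"])

qa = setup_vector_db_and_qa(config=config, data_type="dataset", client=client)
output_df, source_documents = get_result_from_query(
    query="Find me a dataset about flowers", qa=qa, type_of_query="dataset", config=config
)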
"},{"location":"modules/metadata_module/","title":"Metadata module","text":""},{"location":"modules/metadata_module/#metadata_utils.combine_metadata","title":"combine_metadata(all_dataset_metadata, all_data_description_df)","text":"

Description: Combine the descriptions with the metadata table.

Input: all_dataset_metadata (pd.DataFrame) : The metadata table, all_data_description_df (pd.DataFrame) : The descriptions

Returns: The combined metadata table.

Source code in serving/modules/metadata_utils.py
def combine_metadata(all_dataset_metadata: pd.DataFrame, all_data_description_df: pd.DataFrame) -> pd.DataFrame:\n    \"\"\"\n    Description: Combine the descriptions with the metadata table.\n\n    Input: all_dataset_metadata (pd.DataFrame) : The metadata table,\n    all_data_description_df (pd.DataFrame) : The descriptions\n\n    Returns: The combined metadata table.\n    \"\"\"\n    # Combine the descriptions with the metadata table\n    all_dataset_metadata = pd.merge(\n        all_dataset_metadata, all_data_description_df, on=\"did\", how=\"inner\"\n    )\n\n    # Create a single column that has a combined string of all the metadata and the description in the form of \"column - value, column - value, ... description\"\n\n    all_dataset_metadata[\"Combined_information\"] = all_dataset_metadata.apply(\n        merge_all_columns_to_string, axis=1\n    )\n    return all_dataset_metadata\n
"},{"location":"modules/metadata_module/#metadata_utils.create_combined_information_df","title":"create_combined_information_df(data_id, descriptions, joined_qualities, joined_features)","text":"

Description: Create a dataframe with the combined information of the OpenML object.

Input: data_id (int) : The data id, descriptions (list) : The descriptions of the OpenML object, joined_qualities (list) : The joined qualities of the OpenML object, joined_features (list) : The joined features of the OpenML object

Returns: The dataframe with the combined information of the OpenML object.

Source code in serving/modules/metadata_utils.py
def create_combined_information_df(\n    # data_id, descriptions, joined_qualities, joined_features\n    data_id: int| Sequence[int], descriptions: Sequence[str], joined_qualities: Sequence[str], joined_features: Sequence[str]\n) -> pd.DataFrame:\n    \"\"\"\n    Description: Create a dataframe with the combined information of the OpenML object.\n\n    Input: data_id (int) : The data id, descriptions (list) : The descriptions of the OpenML object, joined_qualities (list) : The joined qualities of the OpenML object, joined_features (list) : The joined features of the OpenML object\n\n    Returns: The dataframe with the combined information of the OpenML object.\n    \"\"\"\n    return pd.DataFrame(\n        {\n            \"did\": data_id,\n            \"description\": descriptions,\n            \"qualities\": joined_qualities,\n            \"features\": joined_features,\n        }\n    )\n
"},{"location":"modules/metadata_module/#metadata_utils.create_metadata_dataframe","title":"create_metadata_dataframe(openml_data_object, data_id, all_dataset_metadata, config)","text":"

Creates a dataframe with all the metadata, joined columns with all information for the type of data specified in the config. If training is set to False, the dataframes are loaded from the files. If training is set to True, the dataframes are created and then saved to the files.

Parameters:

  • openml_data_object (list) : The list of OpenML objects. (required)
  • data_id (list) : The list of data ids. (required)
  • all_dataset_metadata (DataFrame) : The metadata table. (required)
  • config (dict) : The config dictionary. (required)

Returns:

  • pd.DataFrame : The combined metadata dataframe.
  • pd.DataFrame : The updated metadata table.

Source code in serving/modules/metadata_utils.py
def create_metadata_dataframe(\n    # openml_data_object, data_id, all_dataset_metadata, config\n    openml_data_object: Sequence[Union[openml.datasets.dataset.OpenMLDataset, openml.flows.flow.OpenMLFlow]], data_id: Sequence[int], all_dataset_metadata: pd.DataFrame, config: dict\n) -> Tuple[pd.DataFrame, pd.DataFrame]:\n    \"\"\"\n    Creates a dataframe with all the metadata, joined columns with all information\n    for the type of data specified in the config. If training is set to False,\n    the dataframes are loaded from the files. If training is set to True, the\n    dataframes are created and then saved to the files.\n\n    Args:\n        openml_data_object (list): The list of OpenML objects.\n        data_id (list): The list of data ids.\n        all_dataset_metadata (pd.DataFrame): The metadata table.\n        config (dict): The config dictionary.\n\n    Returns:\n        pd.DataFrame: The combined metadata dataframe.\n        pd.DataFrame: The updated metadata table.\n    \"\"\"\n    # use os.path.join to ensure compatibility with different operating systems\n    file_path = os.path.join(\n        config[\"data_dir\"], f\"all_{config['type_of_data']}_description.csv\"\n    )\n\n    if not config[\"training\"]:\n        return load_metadata(file_path), all_dataset_metadata\n\n    if config[\"type_of_data\"] == \"dataset\":\n        return process_dataset_metadata(\n            openml_data_object, data_id, all_dataset_metadata, file_path\n        )\n\n    if config[\"type_of_data\"] == \"flow\":\n        return process_flow_metadata(openml_data_object, data_id, file_path)\n\n    raise ValueError(f\"Unsupported type_of_data: {config['type_of_data']}\")\n
"},{"location":"modules/metadata_module/#metadata_utils.extract_attribute","title":"extract_attribute(attribute, attr_name)","text":"

Description: Extract an attribute from the OpenML object.

Input: attribute (object) : The OpenML object, attr_name (str) : The name of the attribute to extract

Returns: The attribute value if it exists, else an empty string.

Source code in serving/modules/metadata_utils.py
def extract_attribute(attribute: object, attr_name: str) -> str:\n    \"\"\"\n    Description: Extract an attribute from the OpenML object.\n\n    Input: attribute (object) : The OpenML object\n\n    Returns: The attribute value if it exists, else an empty string.\n    \"\"\"\n    return getattr(attribute, attr_name, \"\")\n
"},{"location":"modules/metadata_module/#metadata_utils.get_all_metadata_from_openml","title":"get_all_metadata_from_openml(config)","text":"

Description: Gets all the metadata from OpenML for the type of data specified in the config. If training is set to False, it loads the metadata from the files. If training is set to True, it gets the metadata from OpenML.

This uses parallel threads (pqdm); to ensure thread safety, install the oslo.concurrency package.

Input: config (dict) : The config dictionary

Returns: all the data descriptions combined with data ids, data ids, and the raw openml objects in a dataframe.

Source code in serving/modules/metadata_utils.py
def get_all_metadata_from_openml(config: dict) -> Tuple[pd.DataFrame, Sequence[int], pd.DataFrame] | None:\n    \"\"\"\n    Description: Gets all the metadata from OpenML for the type of data specified in the config.\n    If training is set to False, it loads the metadata from the files. If training is set to True, it gets the metadata from OpenML.\n\n    This uses parallel threads (pqdm) and so to ensure thread safety, install the package oslo.concurrency.\n\n\n    Input: config (dict) : The config dictionary\n\n    Returns: all the data descriptions combined with data ids, data ids, and the raw openml objects in a dataframe.\n    \"\"\"\n\n    # save_filename = f\"./data/all_{config['type_of_data']}_metadata.pkl\"\n    # use os.path.join to ensure compatibility with different operating systems\n    save_filename = os.path.join(\n        config[\"data_dir\"], f\"all_{config['type_of_data']}_metadata.pkl\"\n    )\n    # If we are not training, we do not need to recreate the cache and can load the metadata from the files. If the files do not exist, raise an exception.\n    # TODO : Check if this behavior is correct, or if data does not exist, send to training pipeline?\n    if config[\"training\"] == False or config[\"ignore_downloading_data\"] == True:\n        # print(\"[INFO] Training is set to False.\")\n        # Check if the metadata files exist for all types of data\n        if not os.path.exists(save_filename):\n            raise Exception(\n                \"Metadata files do not exist. Please run the training pipeline first.\"\n            )\n        print(\"[INFO] Loading metadata from file.\")\n        # Load the metadata files for all types of data\n        return load_metadata_from_file(save_filename)\n\n    # If we are training, we need to recreate the cache and get the metadata from OpenML\n    if config[\"training\"] == True:\n        print(\"[INFO] Training is set to True.\")\n        # Gather all OpenML objects of the type of data\n        all_objects = get_openml_objects(config[\"type_of_data\"])\n\n        # subset the data for testing\n        if config[\"test_subset_2000\"] == True:\n            print(\"[INFO] Subsetting the data to 2000 rows.\")\n            all_objects = all_objects[:2000]\n\n        data_id = [int(all_objects.iloc[i][\"did\"]) for i in range(len(all_objects))]\n\n        print(\"[INFO] Initializing cache.\")\n        initialize_cache(config[\"type_of_data\"], data_id)\n\n        print(f\"[INFO] Getting {config['type_of_data']} metadata from OpenML.\")\n        openml_data_object = get_metadata_from_openml(config, data_id)\n\n        print(\"[INFO] Saving metadata to file.\")\n        save_metadata_to_file((openml_data_object, data_id, all_objects), save_filename)\n\n        return openml_data_object, data_id, all_objects\n
"},{"location":"modules/metadata_module/#metadata_utils.get_dataset_description","title":"get_dataset_description(dataset_id)","text":"

Get the dataset description from OpenML using the dataset id

Input: dataset_id (int) : The dataset id

Returns: data (openml.datasets.dataset.OpenMLDataset) : The dataset object from OpenML

Source code in serving/modules/metadata_utils.py
def get_dataset_description(dataset_id) -> openml.datasets.dataset.OpenMLDataset:\n    \"\"\"\n    Get the dataset description from OpenML using the dataset id\n\n    Input: dataset_id (int) : The dataset id\n\n    Returns: data (openml.datasets.dataset.OpenMLDataset) : The dataset object from OpenML\n    \"\"\"\n    # TODO : Check for objects that do not have qualities being not downloaded properly\n    # try:\n    data = openml.datasets.get_dataset(\n        dataset_id=dataset_id,\n        download_data=False,\n        download_qualities=True,\n        download_features_meta_data=True,\n    )\n\n    return data\n
"},{"location":"modules/metadata_module/#metadata_utils.get_flow_description","title":"get_flow_description(flow_id)","text":"

Get the flow description from OpenML using the flow id

Input: flow_id (int) : The flow id

Returns: data (openml.flows.flow.OpenMLFlow) : The flow object from OpenML

Source code in serving/modules/metadata_utils.py
def get_flow_description(flow_id: int) -> openml.flows.flow.OpenMLFlow:\n    \"\"\"\n    Get the flow description from OpenML using the flow id\n\n    Input: flow_id (int) : The flow id\n\n    Returns: data (openml.flows.flow.OpenMLFlow) : The flow object from OpenML\n    \"\"\"\n    return openml.flows.get_flow(flow_id=flow_id)\n
"},{"location":"modules/metadata_module/#metadata_utils.get_metadata_from_openml","title":"get_metadata_from_openml(config, data_id)","text":"

Get metadata from OpenML using parallel processing.

Source code in serving/modules/metadata_utils.py
def get_metadata_from_openml(config, data_id: Sequence[int]):\n    \"\"\"\n    Get metadata from OpenML using parallel processing.\n    \"\"\"\n    if config[\"type_of_data\"] == \"dataset\":\n        return pqdm(\n            data_id, get_dataset_description, n_jobs=config[\"data_download_n_jobs\"]\n        )\n    elif config[\"type_of_data\"] == \"flow\":\n        return pqdm(\n            data_id, get_flow_description, n_jobs=config[\"data_download_n_jobs\"]\n        )\n
"},{"location":"modules/metadata_module/#metadata_utils.get_openml_objects","title":"get_openml_objects(type_of_data)","text":"

Get OpenML objects based on the type of data.

Source code in serving/modules/metadata_utils.py
def get_openml_objects(type_of_data: str):\n    \"\"\"\n    Get OpenML objects based on the type of data.\n    \"\"\"\n    if type_of_data == \"dataset\":\n        return openml.datasets.list_datasets(output_format=\"dataframe\")\n    elif type_of_data == \"flow\":\n        all_objects = openml.flows.list_flows(output_format=\"dataframe\")\n        return all_objects.rename(columns={\"id\": \"did\"})\n    else:\n        raise ValueError(\"Invalid type_of_data specified\")\n
"},{"location":"modules/metadata_module/#metadata_utils.initialize_cache","title":"initialize_cache(type_of_data, data_id)","text":"

Initialize cache for the OpenML objects.

Source code in serving/modules/metadata_utils.py
def initialize_cache(type_of_data: str, data_id: Sequence[int]) -> None:\n    \"\"\"\n    Initialize cache for the OpenML objects.\n    \"\"\"\n    if type_of_data == \"dataset\":\n        get_dataset_description(data_id[0])\n    elif type_of_data == \"flow\":\n        get_flow_description(data_id[0])\n
"},{"location":"modules/metadata_module/#metadata_utils.join_attributes","title":"join_attributes(attribute, attr_name)","text":"

Description: Join the attributes of the OpenML object.

Input: attribute (object) : The OpenML object, attr_name (str) : The name of the attribute whose key-value pairs are joined

Returns: The joined attributes if they exist, else an empty string. example: \"column - value, column - value, ...\"

Source code in serving/modules/metadata_utils.py
def join_attributes(attribute: object, attr_name: str) -> str:\n    \"\"\"\n    Description: Join the attributes of the OpenML object.\n\n    Input: attribute (object) : The OpenML object\n\n    Returns: The joined attributes if they exist, else an empty string.\n    example: \"column - value, column - value, ...\"\n    \"\"\"\n\n    return (\n        \" \".join([f\"{k} : {v},\" for k, v in getattr(attribute, attr_name, {}).items()])\n        if hasattr(attribute, attr_name)\n        else \"\"\n    )\n
"},{"location":"modules/metadata_module/#metadata_utils.load_metadata_from_file","title":"load_metadata_from_file(save_filename)","text":"

Load metadata from a file.

Source code in serving/modules/metadata_utils.py
def load_metadata_from_file(save_filename: str) -> Tuple[pd.DataFrame, Sequence[int], pd.DataFrame]:\n    \"\"\"\n    Load metadata from a file.\n    \"\"\"\n    with open(save_filename, \"rb\") as f:\n        return pickle.load(f)\n
"},{"location":"modules/metadata_module/#metadata_utils.merge_all_columns_to_string","title":"merge_all_columns_to_string(row)","text":"

Description: Create a single column that has a combined string of all the metadata and the description in the form of \"column - value, column - value, ... description\"

Input: row (pd.Series) : The row of the dataframe

Returns: The combined string of all the metadata and the description in the form of \"column - value, column - value, ... description\"

Source code in serving/modules/metadata_utils.py
def merge_all_columns_to_string(row: pd.Series) -> str:\n    \"\"\"\n    Description: Create a single column that has a combined string of all the metadata and the description in the form of \"column - value, column - value, ... description\"\n\n    Input: row (pd.Series) : The row of the dataframe\n\n    Returns: The combined string of all the metadata and the description in the form of \"column - value, column - value, ... description\"\n    \"\"\"\n\n    return \" \".join([f\"{col} - {val},\" for col, val in zip(row.index, row.values)])\n
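A tiny illustration of the output format, using a hypothetical two-column row; the expression mirrors the function body above:

import pandas as pd

row = pd.Series({"did": 31, "name": "credit-g"})  # hypothetical row
# Prints: "did - 31, name - credit-g,"
print(" ".join([f"{col} - {val}," for col, val in zip(row.index, row.values)]))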
"},{"location":"modules/metadata_module/#metadata_utils.process_dataset_metadata","title":"process_dataset_metadata(openml_data_object, data_id, all_dataset_metadata, file_path)","text":"

Description: Process the dataset metadata.

Input: openml_data_object (list) : The list of OpenML objects, data_id (list) : The list of data ids, all_dataset_metadata (pd.DataFrame) : The metadata table, file_path (str) : The file path

Returns: The combined metadata dataframe and the updated metadata table.

Source code in serving/modules/metadata_utils.py
def process_dataset_metadata(\n    openml_data_object: Sequence[openml.datasets.dataset.OpenMLDataset], data_id: Sequence[int], all_dataset_metadata: pd.DataFrame, file_path: str\n) -> Tuple[pd.DataFrame, pd.DataFrame]:\n    \"\"\"\n    Description: Process the dataset metadata.\n\n    Input: openml_data_object (list) : The list of OpenML objects, data_id (list) : The list of data ids, all_dataset_metadata (pd.DataFrame) : The metadata table, file_path (str) : The file path\n\n    Returns: The combined metadata dataframe and the updated metadata table.\n    \"\"\"\n    descriptions = [\n        extract_attribute(attr, \"description\") for attr in openml_data_object\n    ]\n    joined_qualities = [\n        join_attributes(attr, \"qualities\") for attr in openml_data_object\n    ]\n    joined_features = [join_attributes(attr, \"features\") for attr in openml_data_object]\n\n    all_data_description_df = create_combined_information_df(\n        data_id, descriptions, joined_qualities, joined_features\n    )\n    all_dataset_metadata = combine_metadata(\n        all_dataset_metadata, all_data_description_df\n    )\n\n    all_dataset_metadata.to_csv(file_path)\n\n    return (\n        all_dataset_metadata[[\"did\", \"name\", \"Combined_information\"]],\n        all_dataset_metadata,\n    )\n
"},{"location":"modules/metadata_module/#metadata_utils.process_flow_metadata","title":"process_flow_metadata(openml_data_object, data_id, file_path)","text":"

Description: Process the flow metadata.

Input: openml_data_object (list) : The list of OpenML objects, data_id (list) : The list of data ids, file_path (str) : The file path

Returns: The combined metadata dataframe and the updated metadata table.

Source code in serving/modules/metadata_utils.py
def process_flow_metadata(openml_data_object: Sequence[openml.flows.flow.OpenMLFlow], data_id: Sequence[int], file_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:\n    \"\"\"\n    Description: Process the flow metadata.\n\n    Input: openml_data_object (list) : The list of OpenML objects, data_id (list) : The list of data ids, file_path (str) : The file path\n\n    Returns: The combined metadata dataframe and the updated metadata table.\n    \"\"\"\n    descriptions = [\n        extract_attribute(attr, \"description\") for attr in openml_data_object\n    ]\n    names = [extract_attribute(attr, \"name\") for attr in openml_data_object]\n    tags = [extract_attribute(attr, \"tags\") for attr in openml_data_object]\n\n    all_data_description_df = pd.DataFrame(\n        {\n            \"did\": data_id,\n            \"description\": descriptions,\n            \"name\": names,\n            \"tags\": tags,\n        }\n    )\n\n    all_data_description_df[\"Combined_information\"] = all_data_description_df.apply(\n        merge_all_columns_to_string, axis=1\n    )\n    all_data_description_df.to_csv(file_path)\n\n    return (\n        all_data_description_df[[\"did\", \"name\", \"Combined_information\"]],\n        all_data_description_df,\n    )\n
"},{"location":"modules/metadata_module/#metadata_utils.save_metadata_to_file","title":"save_metadata_to_file(data, save_filename)","text":"

Save metadata to a file.

Source code in serving/modules/metadata_utils.py
def save_metadata_to_file(data, save_filename: str):\n    \"\"\"\n    Save metadata to a file.\n    \"\"\"\n    with open(save_filename, \"wb\") as f:\n        pickle.dump(data, f)\n
"},{"location":"modules/result_gen/","title":"Result gen","text":""},{"location":"modules/result_gen/#results_gen.aggregate_multiple_queries_and_count","title":"aggregate_multiple_queries_and_count(queries, qa_dataset, config, group_cols=['id', 'name'], sort_by='query')","text":"

Description: Aggregate the results of multiple queries into a single dataframe and count the number of times a dataset appears in the results

Input: queries (list) : List of queries, group_cols (list) : List of columns to group by

Returns: Combined dataframe with the results of all queries

Source code in serving/modules/results_gen.py
def aggregate_multiple_queries_and_count(\n    queries, qa_dataset, config, group_cols=[\"id\", \"name\"], sort_by=\"query\"\n) -> pd.DataFrame:\n    \"\"\"\n    Description: Aggregate the results of multiple queries into a single dataframe and count the number of times a dataset appears in the results\n\n    Input:\n        queries: List of queries\n        group_cols: List of columns to group by\n\n    Returns: Combined dataframe with the results of all queries\n    \"\"\"\n    combined_df = pd.DataFrame()\n    for query in tqdm(queries, total=len(queries)):\n        result_data_frame = get_result_from_query(\n            query=query, qa=qa_dataset, type_of_query=\"dataset\", config=config\n        )\n        result_data_frame = result_data_frame[group_cols]\n        # Concat with combined_df with a column to store the query\n        result_data_frame[\"query\"] = query\n        combined_df = pd.concat([combined_df, result_data_frame])\n    combined_df = (\n        combined_df.groupby(group_cols)\n        .count()\n        .reset_index()\n        .sort_values(by=sort_by, ascending=False)\n    )\n    return combined_df\n
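A usage sketch, assuming qa is a retriever built by setup_vector_db_and_qa and config is the loaded config dictionary:

queries = [
    "Find me a dataset about flowers",
    "Find me a dataset about credit risk",
]
combined_df = aggregate_multiple_queries_and_count(
    queries, qa_dataset=qa, config=config, group_cols=["id", "name"], sort_by="query"
)
# combined_df has one row per (id, name) pair, with a count of how many
# of the queries returned that dataset.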
"},{"location":"modules/result_gen/#results_gen.check_query","title":"check_query(query)","text":"

Description: Performs checks on the query: replaces %20 with a space character (browsers do this automatically when spaces are in the URL), removes leading and trailing spaces, and truncates the query to 200 characters.

Input: query (str)

Returns: query (str)

Source code in serving/modules/results_gen.py
def check_query(query: str) -> str:\n    \"\"\"\n    Description: Performs checks on the query\n    - Replaces %20 with space character (browsers do this automatically when spaces are in the URL)\n    - Removes leading and trailing spaces\n    - Limits the query to 150 characters\n\n    Input: query (str)\n\n    Returns: None\n    \"\"\"\n    if query == \"\":\n        raise ValueError(\"Query cannot be empty.\")\n    query = query.replace(\n        \"%20\", \" \"\n    )  # replace %20 with space character (browsers do this automatically when spaces are in the URL)\n    # query = query.replace(\"dataset\", \"\")\n    # query = query.replace(\"flow\", \"\")\n    query = query.strip()\n    query = query[:200]\n    return query\n
"},{"location":"modules/result_gen/#results_gen.create_output_dataframe","title":"create_output_dataframe(dict_results, type_of_data, ids_order)","text":"

Description: Create an output dataframe with the results. The URLs are API calls to the OpenML API for the specific type of data.

Input: dict_results (dict), type_of_data (str), ids_order (list)

Returns: A dataframe with the results and duplicate names removed.

Source code in serving/modules/results_gen.py
def create_output_dataframe(dict_results: dict, type_of_data: str, ids_order: list) -> pd.DataFrame:\n    \"\"\"\n    Description: Create an output dataframe with the results. The URLs are API calls to the OpenML API for the specific type of data.\n\n    Input: dict_results (dict), type_of_data (str)\n\n    Returns: A dataframe with the results and duplicate names removed.\n    \"\"\"\n    output_df = pd.DataFrame(dict_results).T.reset_index()\n    # order the rows based on the order of the ids\n    output_df[\"index\"] = output_df[\"index\"].astype(int)\n    output_df = output_df.set_index(\"index\").loc[ids_order].reset_index()\n    # output_df[\"urls\"] = output_df[\"index\"].apply(\n    #     lambda x: f\"https://www.openml.org/api/v1/json/{type_of_data}/{x}\"\n    # )\n    # https://www.openml.org/search?type=data&sort=runs&status=any&id=31\n    output_df[\"urls\"] = output_df[\"index\"].apply(\n        lambda x: f\"https://www.openml.org/search?type={type_of_data}&id={x}\"\n    )\n    output_df[\"urls\"] = output_df[\"urls\"].apply(make_clickable)\n    # data = openml.datasets.get_dataset(\n    # get rows with unique names\n    if type_of_data == \"data\":\n        output_df[\"command\"] = output_df[\"index\"].apply(\n            lambda x: f\"dataset = openml.datasets.get_dataset({x})\"\n        )\n    elif type_of_data == \"flow\":\n        output_df[\"command\"] = output_df[\"index\"].apply(\n            lambda x: f\"flow = openml.flows.get_flow({x})\"\n        )\n    output_df = output_df.drop_duplicates(subset=[\"name\"])\n    # order the columns\n    output_df = output_df[[\"index\", \"name\", \"command\", \"urls\", \"page_content\"]].rename(\n        columns={\"index\": \"id\", \"urls\": \"OpenML URL\", \"page_content\": \"Description\"}\n    )\n    return output_df\n
"},{"location":"modules/result_gen/#results_gen.fetch_results","title":"fetch_results(query, qa, type_of_query, config)","text":"

Description: Fetch results for the query using the QA chain.

Input: query (str), qa (langchain.chains.retrieval_qa.base.RetrievalQA), type_of_query (str), config (dict)

Returns: results[\"source_documents\"] (list)

Source code in serving/modules/results_gen.py
def fetch_results(query: str, qa: langchain.chains.retrieval_qa.base.RetrievalQA, type_of_query: str, config: dict) -> Sequence[Document]:\n    \"\"\"\n    Description: Fetch results for the query using the QA chain.\n\n    Input: query (str), qa (langchain.chains.retrieval_qa.base.RetrievalQA), type_of_query (str), config (dict)\n\n    Returns: results[\"source_documents\"] (list)\n    \"\"\"\n    results = qa.invoke(\n        input=query,\n        config={\"temperature\": config[\"temperature\"], \"top-p\": config[\"top_p\"]},\n    )\n    if config[\"long_context_reorder\"] == True:\n        results = long_context_reorder(results)\n    id_column = {\"dataset\": \"did\", \"flow\": \"id\", \"data\": \"did\"}\n    id_column = id_column[type_of_query]\n\n    if config[\"reranking\"] == True:\n        try:\n            print(\"[INFO] Reranking results...\")\n            ranker = Ranker(model_name=\"ms-marco-MiniLM-L-12-v2\", cache_dir=\"/tmp/\")\n            rerankrequest = RerankRequest(\n                query=query,\n                passages=[\n                    {\"id\": result.metadata[id_column], \"text\": result.page_content}\n                    for result in results\n                ],\n            )\n            ranking = ranker.rerank(rerankrequest)\n            ids = [result[\"id\"] for result in ranking]\n            ranked_results = [\n                result for result in results if result.metadata[id_column] in ids\n            ]\n            print(\"[INFO] Reranking complete.\")\n            return ranked_results\n        except Exception as e:\n            print(f\"[ERROR] Reranking failed: {e}\")\n            return results\n\n    else:\n        return results\n
"},{"location":"modules/result_gen/#results_gen.get_result_from_query","title":"get_result_from_query(query, qa, type_of_query, config)","text":"

Description: Get the result from the query using the QA chain and return the results in a dataframe that is then sent to the frontend.

Input: query (str), qa (langchain.chains.retrieval_qa.base.RetrievalQA), type_of_query (str), config (dict)

Returns: output_df (pd.DataFrame), source_documents (list)

Source code in serving/modules/results_gen.py
def get_result_from_query(query, qa, type_of_query, config) -> Tuple[pd.DataFrame, Sequence[Document]]:\n    \"\"\"\n    Description: Get the result from the query using the QA chain and return the results in a dataframe that is then sent to the frontend.\n\n    Input: query (str), qa (langchain.chains.retrieval_qa.base.RetrievalQA), type_of_query (str)\n\n    Returns: output_df (pd.DataFrame)\n    \"\"\"\n    if type_of_query == \"dataset\":\n        # Fixing the key_name for dataset because of the way the OpenML API returns the data\n        type_of_query = \"data\"\n    elif type_of_query == \"flow\":\n        type_of_query = \"flow\"\n    else:\n        raise ValueError(f\"Unsupported type_of_data: {type_of_query}\")\n\n    # Process the query\n    query = check_query(query)\n    if query == \"\":\n        return pd.DataFrame(), []\n    source_documents = fetch_results(\n        query, qa, config=config, type_of_query=type_of_query\n    )\n    dict_results, ids_order = process_documents(source_documents)\n    output_df = create_output_dataframe(dict_results, type_of_query, ids_order)\n\n    return output_df, source_documents\n
"},{"location":"modules/result_gen/#results_gen.long_context_reorder","title":"long_context_reorder(results)","text":"

Description: Lost-in-the-middle reordering: the less relevant documents are placed in the middle of the list and the more relevant ones at the beginning and end. See: https://arxiv.org/abs/2307.03172

Input: results (list)

Returns: reordered results (list)

Source code in serving/modules/results_gen.py
def long_context_reorder(results: Sequence[Document]) -> Sequence[Document]:\n    \"\"\"\n    Description: Lost in the middle reorder: the less relevant documents will be at the\n    middle of the list and more relevant elements at beginning / end.\n    See: https://arxiv.org/abs//2307.03172\n\n    Input: results (list)\n\n    Returns: reorder results (list)\n    \"\"\"\n    print(\"[INFO] Reordering results...\")\n    reordering = LongContextReorder()\n    results = reordering.transform_documents(results)\n    print(\"[INFO] Reordering complete.\")\n    return results\n
"},{"location":"modules/result_gen/#results_gen.make_clickable","title":"make_clickable(val)","text":"

Description: Make the URL clickable in the dataframe.

Source code in serving/modules/results_gen.py
def make_clickable(val : str) -> str:\n    \"\"\"\n    Description: Make the URL clickable in the dataframe.\n    \"\"\"\n    return '<a href=\"{}\">{}</a>'.format(val, val)\n
"},{"location":"modules/result_gen/#results_gen.process_documents","title":"process_documents(source_documents)","text":"

Description: Process the source documents and create a dictionary with the document id ("did") as the key and the name and page content as the values.

Input: source_documents (list)

Returns: dict_results (OrderedDict), ids (list)

Source code in serving/modules/results_gen.py
def process_documents(source_documents : Sequence[Document]) -> Tuple[OrderedDict, list]:\n    \"\"\"\n    Description: Process the source documents and create a dictionary with the key_name as the key and the name and page content as the values.\n\n    Input: source_documents (list), key_name (str)\n\n    Returns: dict_results (dict)\n    \"\"\"\n    dict_results = OrderedDict()\n    for result in source_documents:\n        dict_results[result.metadata[\"did\"]] = {\n            \"name\": result.metadata[\"name\"],\n            \"page_content\": result.page_content,\n        }\n    ids = [result.metadata[\"did\"] for result in source_documents]\n    return dict_results, ids\n
"}]} \ No newline at end of file diff --git a/testing/index.html b/testing/index.html index afb4230..70a67d6 100644 --- a/testing/index.html +++ b/testing/index.html @@ -318,6 +318,15 @@ @@ -516,6 +570,15 @@ @@ -550,6 +658,10 @@

Testing

+

Unit Testing

+
    +
  • Run python -m unittest tests/unit_testing.py to run the unit tests.
  • +

Load Testing

  • Load testing can be done using Locust, a load testing tool that allows you to simulate users querying the API and measure the performance of the API under load from numerous users.
  • @@ -560,6 +672,253 @@

    Running the load testAll tests

    + + +
    + + + + +
    +

    + Bases: TestCase

    + + +
    + Source code in tests/unit_testing.py +
    15
    +16
    +17
    +18
    +19
    +20
    +21
    +22
    +23
    +24
    +25
    +26
    +27
    +28
    +29
    +30
    +31
    +32
    +33
    +34
    +35
    +36
    +37
    +38
    +39
    +40
    +41
    +42
    +43
    +44
    +45
    +46
    +47
    +48
    +49
    +50
    +51
    +52
    +53
    +54
    +55
    +56
    +57
    class TestConfig(unittest.TestCase):
    +    def __init__(self, *args, **kwargs):
    +        super().__init__(*args, **kwargs)
    +        self.client = chromadb.PersistentClient(path=config["persist_dir"])
    +        self.config_keys = ["rqa_prompt_template", "llm_prompt_template",
    +        "num_return_documents", "embedding_model", "llm_model", "num_documents_for_llm", "data_dir", "persist_dir", "testing_flag", "ignore_downloading_data", "test_subset_2000", "data_download_n_jobs", "training", "temperature", "top_p", "search_type", "reranking", "long_context_reorder"]
    +        self.query_test_dict = {
    +            "dataset": "Find me a dataset about flowers that has a high number of instances.",
    +            "flow": "Find me a flow that uses the RandomForestClassifier.",
    +        }
    +    def test_check_data_dirs(self):
    +        """
    +        Description: Check if the data directory exists.
    +        Returns: None
    +        """
    +        self.assertTrue(os.path.exists(config["data_dir"]))
    +        self.assertTrue(os.path.exists(config["persist_dir"]))
    +
    +    def test_config(self):
    +        """
    +        Description: Check if the config has the required keys.
    +        Returns: None
    +        """
    +        for key in self.config_keys:
    +            self.assertIn(key, config.keys())
    +
    +    def test_setup_vector_db_and_qa(self):
    +        """
    +        Description: Check if the setup_vector_db_and_qa function works as expected.
    +        Returns: None
    +        """
    +        for type_of_data in ["dataset", "flow"]:
    +            self.qa = setup_vector_db_and_qa(
    +                config=config, data_type=type_of_data, client=self.client
    +            )
    +            self.assertIsNotNone(self.qa)
    +            self.result_data_frame = get_result_from_query(
    +                query=self.query_test_dict[type_of_data],
    +                qa=self.qa,
    +                type_of_query=type_of_data,
    +                config=config,
    +            )
    +            self.assertIsNotNone(self.result_data_frame)
    +
    +
    + + + +
    + + + + + + + + + +
    + + +

    + test_check_data_dirs() + +

    + + +
    + +

    Description: Check if the data directory exists. +Returns: None

    + +
    + Source code in tests/unit_testing.py +
    25
    +26
    +27
    +28
    +29
    +30
    +31
    def test_check_data_dirs(self):
    +    """
    +    Description: Check if the data directory exists.
    +    Returns: None
    +    """
    +    self.assertTrue(os.path.exists(config["data_dir"]))
    +    self.assertTrue(os.path.exists(config["persist_dir"]))
    +
    +
    +
    + +
    + +
    + + +

    + test_config() + +

    + + +
    + +

    Description: Check if the config has the required keys. +Returns: None

    + +
    + Source code in tests/unit_testing.py +
    33
    +34
    +35
    +36
    +37
    +38
    +39
    def test_config(self):
    +    """
    +    Description: Check if the config has the required keys.
    +    Returns: None
    +    """
    +    for key in self.config_keys:
    +        self.assertIn(key, config.keys())
    +
    +
    +
    + +
    + +
    + + +

    + test_setup_vector_db_and_qa() + +

    + + +
    + +

    Description: Check if the setup_vector_db_and_qa function works as expected. +Returns: None

    + +
    + Source code in tests/unit_testing.py +
    41
    +42
    +43
    +44
    +45
    +46
    +47
    +48
    +49
    +50
    +51
    +52
    +53
    +54
    +55
    +56
    +57
    def test_setup_vector_db_and_qa(self):
    +    """
    +    Description: Check if the setup_vector_db_and_qa function works as expected.
    +    Returns: None
    +    """
    +    for type_of_data in ["dataset", "flow"]:
    +        self.qa = setup_vector_db_and_qa(
    +            config=config, data_type=type_of_data, client=self.client
    +        )
    +        self.assertIsNotNone(self.qa)
    +        self.result_data_frame = get_result_from_query(
    +            query=self.query_test_dict[type_of_data],
    +            qa=self.qa,
    +            type_of_query=type_of_data,
    +            config=config,
    +        )
    +        self.assertIsNotNone(self.result_data_frame)
    +
    +
    +
    + +
    + + + +
    + +
    + +