diff --git a/.circleci/config.yml b/.circleci/config.yml index ed5a05f5e9..26200eb04e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -15,6 +15,12 @@ workflows: name: Linter | v3.10 | Linux mode: LINTER ################################ + ## DOC: All + ################################ + ################################ + - Doc: + name: Doc | v3.10 | Linux + ################################ #### PIP: Master ################################ ################################ @@ -94,6 +100,17 @@ workflows: - master - staging ################################ + ### THIRD PARTY: Staging, Master + ################################ + ################################ + - MySQL: + name: Third Party Test - MySQL | v3.10 | Linux + filters: + branches: + only: + - master + - staging + ################################ ### NOTEBOOKS: Staging, Master ################################ ################################ @@ -178,8 +195,8 @@ jobs: RAY: << parameters.ray >> command: | "python<< parameters.v >>" -m venv test_evadb - pip install --upgrade pip source test_evadb/bin/activate + pip install --upgrade pip if [ $RAY = "ENABLED" ]; then if [ $PY_VERSION != "3.11" ]; then pip install ".[dev,ray,qdrant]" @@ -225,6 +242,101 @@ jobs: paths: - test_evadb + Doc: + parameters: + v: + type: string + default: "3.10" + mode: + type: string + default: "DOC" + resource_class: large + docker: + # https://circleci.com/docs/circleci-images#language-image-variants + - image: "cimg/python:<< parameters.v >>-node" + steps: + + - checkout + + # Restore pip wheel + - restore_cache: + keys: + - v1-pip-wheel_cache-python<< parameters.v >>-{{ checksum "docs/requirements.txt" }}-{{ checksum "package-lock.json" }} + + - run: + name: Install EvaDB Doc dependencies + command: | + "python<< parameters.v >>" -m venv test_evadb_doc + source test_evadb_doc/bin/activate + pip install --upgrade pip + pip install -r docs/requirements.txt + npm install markdown-link-check + + - run: + name: Test doc 
build and link validation + no_output_timeout: 10m # 10 minute timeout + command: | + source test_evadb_doc/bin/activate + bash script/test/test.sh -m "<< parameters.mode >>" + + - save_cache: + key: v1-pip-wheel_cache-python<< parameters.v >>-{{ checksum "docs/requirements.txt" }}-{{ checksum "package-lock.json" }} + paths: + - test_evadb_doc + - node_modules + + MySQL: + parameters: + v: + type: string + default: "3.10" + resource_class: large + docker: + - image: "cimg/python:<< parameters.v >>" + - image: "cimg/mysql:8.0" + environment: + MYSQL_USER: eva + MYSQL_PASSWORD: password + MYSQL_DATABASE: evadb + + steps: + + - checkout + + # Restore pip wheel + - restore_cache: + keys: + - v1-pip-wheel_cache-python<< parameters.v >>-rayDISABLED-{{ checksum "setup.py" }} + + - restore_cache: + keys: + - v1-model_cache-{{ checksum "setup.py" }} + + - run: + name: Install dockerize + command: wget https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz && sudo tar -C /usr/local/bin -xzvf dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz && rm dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz + environment: + DOCKERIZE_VERSION: v0.6.1 + + - run: + name: Wait for DB to run + command : dockerize -wait tcp://localhost:3306 -timeout 1m + + - run: + name: Install EvaDB package from GitHub repo with all dependencies + command: | + "python<< parameters.v >>" -m venv test_evadb + pip install --upgrade pip + source test_evadb/bin/activate + pip install ".[dev]" + pip install -r evadb/third_party/databases/mysql/requirements.txt + + - run: + name: Run integration tests + command: | + source test_evadb/bin/activate + PYTHONPATH="." 
python -m pytest test/third_party_tests/test_native_executor.py -k test_should_run_query_in_mysql + Postgres: parameters: v: @@ -243,6 +355,11 @@ jobs: - checkout + # Restore pip wheel + - restore_cache: + keys: + - v1-pip-wheel_cache-python<< parameters.v >>-rayDISABLED-{{ checksum "setup.py" }} + - restore_cache: keys: - v1-model_cache-{{ checksum "setup.py" }} @@ -264,6 +381,7 @@ jobs: pip install --upgrade pip source test_evadb/bin/activate pip install ".[dev]" + pip install -r evadb/third_party/databases/postgres/requirements.txt - run: name: Run integration tests @@ -317,7 +435,7 @@ jobs: source test_evadb/bin/activate pip install --upgrade pip pip debug --verbose - pip install ".[dev,ludwig,qdrant]" + pip install ".[dev,ludwig,qdrant,forecasting]" source test_evadb/bin/activate bash script/test/test.sh -m "<< parameters.mode >>" diff --git a/.gitignore b/.gitignore index b9a2309a68..1a091b9894 100644 --- a/.gitignore +++ b/.gitignore @@ -205,8 +205,13 @@ dep.txt *eva_data/ *evadb_data/ -# models +# models, but not apply to codebase models/ +!evadb/models +!evadb/catalog/models +!test/unit_tests/models +!test/unit_tests/catalog/models + # test files test.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 1750e11127..54502ff89c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### [Deprecated] ### [Removed] +## [0.3.3] - 2023-08-29 + +* PR #983: doc: fix nav bar +* PR #982: fix: batch merge causing redundant row +* PR #981: fix: use the same interface +* PR #979: docs: added logo +* PR #980: docs: Update README.md +* PR #975: Simplify the ludwig dependency +* PR #972: feat: improve dev doc +* PR #971: Revert "feat: Integrating thirdy party Slack API " +* PR #967: feat: Integrating thirdy party Slack API +* PR #966: Developer guide for new structure data source integration +* PR #949: feat: improve circle ci +* PR #946: Support `SELECT Func + ## [0.3.2] - 2023-08-25 * 
PR #953: docs: Fix User Reference and Dev Guide diff --git a/README.md b/README.md index a6ea5572b2..b759853950 100644 --- a/README.md +++ b/README.md @@ -1,238 +1,252 @@ -# EvaDB: Database System for AI Apps - -

- EvaDB + EvaDB

-# - -
- - Check out EvaDB on Colab - - - Slack - - - Twitter - - - Roadmap - -
- PyPI - License - Coverage Status - - Python Versions -
+

Bring AI inside your database system and build AI-powered apps

+ +

+ +EvaDB forks + + + +EvaDB stars + + +EvaDB pull-requests + + + + + + +EvaDB Commits + +

+ +

Follow EvaDB

+ +

+ +Join EvaDB Slack Community + + +Follow evadb_ai + + +EvaDB on Medium + +EvaDB Website + +

-

EvaDB is a database system for building simpler and faster AI-powered applications.

+

Share EvaDB

-EvaDB is a database system for developing AI apps. We aim to simplify the development and deployment of AI apps that operate on unstructured data (text documents, videos, PDFs, podcasts, etc.) and structured data (tables, vector index). +

-The high-level Python and SQL APIs allow beginners to use EvaDB in a few lines of code. Advanced users can define custom user-defined functions that wrap around any AI model or Python library. EvaDB is fully implemented in Python and licensed under an Apache license. + +Follow _superAGI +Share on Telegram + +Share on Reddit +

+ +

+ + Launch EvaDB on Colab + + + Roadmap + + Python Versions Supported + License + Coverage Status + +
+ Open in Gitpod +

+ +EvaDB enables software developers to build AI apps in a few lines of code. Its powerful SQL API simplifies AI app development for both structured and unstructured data. EvaDB's benefits include: +- 🔮 Easy to connect EvaDB with your SQL database system and build AI-powered apps with SQL queries +- 🤝 Query your data with a pre-trained AI model from Hugging Face, OpenAI, YOLO, PyTorch, and other AI frameworks +- ⚡️ Faster queries thanks to AI-centric query optimization +- 💰 Save money spent on running models by efficient CPU/GPU use +- 🔧 Fine-tune your AI models to achieve better results + +👋 Hey! If you're excited about our vision of bringing AI inside database systems, show some ❤️ by: + ## Quick Links -- [Features](#features) -- [Quick Start](#quick-start) +- [Quick Links](#quick-links) - [Documentation](#documentation) +- [Why EvaDB](#why-evadb) +- [How does EvaDB work](#how-does-evadb-work) +- [Illustrative Queries](#illustrative-queries) +- [Illustrative Apps](#illustrative-apps) +- [More Illustrative Queries](#more-illustrative-queries) +- [Architecture of EvaDB](#architecture-of-evadb) - [Community and Support](#community-and-support) -- [Twitter](https://twitter.com/evadb_ai) +- [Contributing](#contributing) +- [Star History](#star-history) +- [License](#license) -## Features - -- 🔮 Build simpler AI-powered apps using Python functions or SQL queries -- ⚡️ 10x faster applications using AI-centric query optimization -- 💰 Save money spent on inference -- 🚀 First-class support for your custom deep learning models through user-defined functions -- 📦 Built-in caching to eliminate redundant model invocations across queries -- ⌨️ Integrations for PyTorch, Hugging Face, YOLO, and Open AI models -- 🐍 Installable via pip and fully implemented in Python +## Documentation -## Illustrative Applications +You can find the complete documentation of EvaDB at [evadb.ai/docs](https://evadb.ai/docs/) 📚✨🚀 -Here are some illustrative AI apps built using EvaDB (each notebook can 
be opened on Google Colab): +## Why EvaDB + +In the world of AI, we've reached a stage where many AI tasks that were traditionally handled by AI or ML engineers can now be automated. EvaDB enables software developers with the ability to perform advanced AI tasks without needing to delve into the intricate details. - * 🔮 PrivateGPT - * 🔮 ChatGPT-based Video Question Answering - * 🔮 Querying PDF Documents - * 🔮 Analysing Traffic Flow with YOLO - * 🔮 Examining Emotions of Movie - * 🔮 Image Segmentation with Hugging Face +EvaDB covers many AI applications, including regression, classification, image recognition, question answering, and many other generative AI applications. EvaDB targets 99% of AI problems that are often repetitive and can be automated with a simple function call in an SQL query. Until now, there is no comprehensive open-source framework for bringing AI into an existing SQL database system with a principled AI optimization framework, and that's where EvaDB comes in. -## Documentation +Our target audience is software developers who may not necessarily have a background in AI but require AI capabilities to solve specific problems. We target programmers who write simple SQL queries inside their CRUD apps. With EvaDB, it is possible to easily add AI features to these apps by calling built-in AI functions in the queries. -* [Documentation](https://evadb.readthedocs.io/) - - The Getting Started page shows how you can use EvaDB for different AI tasks and how you can easily extend EvaDB to support your custom deep learning model through user-defined functions. - - The User Guides section contains Jupyter Notebooks that demonstrate how to use various features of EvaDB. Each notebook includes a link to Google Colab, where you can run the code yourself. 
-* [Join us on Slack](https://join.slack.com/t/eva-db/shared_invite/zt-1i10zyddy-PlJ4iawLdurDv~aIAq90Dg) -* [Follow us on Twitter](https://twitter.com/evadb_ai) -* [Roadmap](https://github.com/orgs/georgia-tech-db/projects/3) +## How does EvaDB work -## Quick Start +
+ + +Follow the [getting started](https://evadb.readthedocs.io/en/stable/source/overview/getting-started.html) guide to get on-boarded as fast as possible. +
-- Step 1: Install EvaDB using `pip`. EvaDB supports Python versions >= `3.8`: +## Illustrative Queries -```shell -pip install evadb -``` +* Run the MNIST Image Classification model to obtain digit labels for each frame in the video. -- Step 2: It's time to write an AI app. - -```python -import evadb - -# Grab a EvaDB cursor to load data into tables and run AI queries -cursor = evadb.connect().cursor() - -# Load a collection of news videos into the 'news_videos' table -# This function returns a Pandas dataframe with the query's output -# In this case, the output dataframe indicates the number of loaded videos -cursor.load( - file_regex="news_videos/*.mp4", - format="VIDEO", - table_name="news_videos" -).df() - -# Define a function that wraps around your deep learning model -# Here, this function wraps around a speech-to-text model -# After registering the function, we can use the registered function in subsequent queries -cursor.create_function( - udf_name="SpeechRecognizer", - type="HuggingFace", - task='automatic-speech-recognition', - model='openai/whisper-base' -).df() - -# EvaDB automatically extracts the audio from the video -# We only need to run the SpeechRecongizer function on the 'audio' column -# to get the transcript and persist it in a table called 'transcripts' -cursor.query( - """CREATE TABLE transcripts AS - SELECT SpeechRecognizer(audio) from news_videos;""" -).df() - -# We next incrementally construct the ChatGPT query using EvaDB's Python API -# The query is based on the 'transcripts' table -# This table has a column called 'text' with the transcript text -query = cursor.table('transcripts') - -# Since ChatGPT is a built-in function, we don't have to define it -# We can just directly use it in the query -# We need to set the OPENAI_KEY as an environment variable -os.environ["OPENAI_KEY"] = OPENAI_KEY -query = query.select("ChatGPT('Is this video summary related to LLMs', text)") - -# Finally, we run the query to get the results as a dataframe -# 
You can then post-process the dataframe using other Python libraries -response = query.df() +```sql +SELECT MnistImageClassifier(data).label FROM mnist_video; ``` -- **Incrementally build an AI query that chains together multiple models** +* Build a vector index on the feature embeddings returned by the SIFT Feature Extractor on a collection of Reddit images. -Here is a AI query that analyses emotions of actors in an `Interstellar` movie clip using multiple PyTorch models. +```sql +CREATE INDEX reddit_sift_image_index + ON reddit_dataset (SiftFeatureExtractor(data)) + USING FAISS +``` -```python -# Access the Interstellar movie clip table using a cursor -query = cursor.table("Interstellar") -# Get faces using a `FaceDetector` function -query = query.cross_apply("UNNEST(FaceDetector(data))", "Face(bounding_box, confidence)") -# Focus only on frames 100 through 200 in the clip -query = query.filter("id > 100 AND id < 200") -# Get the emotions of the detected faces using a `EmotionDetector` function -query = query.select("id, bbox, EmotionDetector(Crop(data, bounding_box))") +* Retrieve the top-5 most similar images for the given image using the index. -# Run the query and get the query result as a dataframe -# At each of the above steps, you can run the query and see the output -# If you are familiar with SQL, you can get the SQL query with query.sql_query() -response = query.df() +```sql +SELECT name FROM reddit_dataset ORDER BY + Similarity( + SiftFeatureExtractor(Open('reddit-images/g1074_d4mxztt.jpg')), + SiftFeatureExtractor(data) + ) + LIMIT 5 ``` -- **EvaDB runs AI apps 10x faster using its AI-centric query optimizer**. - - Three key built-in optimizations are: +## Illustrative Apps - 💾 **Caching**: EvaDB automatically caches and reuses model inference results. 
+Here are some illustrative AI apps built using EvaDB (each notebook can be opened on Google Colab): - ⚡️ **Parallel Query Execution**: EvaDB runs the app in parallel on all the available hardware resources (CPUs and GPUs). + * 🔮 PrivateGPT + * 🔮 ChatGPT-based Video Question Answering + * 🔮 Querying PDF Documents + * 🔮 Analysing Traffic Flow with YOLO + * 🔮 Examining Emotions of Movie + * 🔮 Image Segmentation with Hugging Face - 🎯 **Model Ordering**: EvaDB optimizes the order in which models are evaluated (e.g., runs the faster, more selective model first). -## Architecture Diagram +## More Illustrative Queries -This diagram presents the key components of EvaDB. EvaDB's AI-centric query optimizer takes a query as input and generates a query plan that is executed by the query engine. The query engine hits the relevant storage engines to quickly retrieve the data required for efficiently running the query: -1. Structured data (SQL database system connected via `sqlalchemy`). -2. Unstructured media data (PDFs, videos, etc. on cloud/local filesystem). -3. Feature data (vector database system). +
-Architecture Diagram +* Store the text returned by a Speech Recognition model on the audio component of a video in a table. -## Screenshots +```sql +CREATE TABLE text_summary AS + SELECT SpeechRecognizer(audio) FROM ukraine_video; +``` -### 🔮 [Traffic Analysis](https://evadb.readthedocs.io/en/stable/source/tutorials/02-object-detection.html) (Object Detection Model) -| Source Video | Query Result | -|---------------|--------------| -|Source Video |Query Result | +* Run ChatGPT on the `text` column in a table. -### 🔮 [PDF Question Answering](https://evadb.readthedocs.io/en/stable/source/tutorials/12-query-pdf.html) (Question Answering Model) +```sql +SELECT ChatGPT('Is this video summary related to Ukraine russia war', text) + FROM text_summary; +``` -| App | -|-----| -|Source Video | +* Train an ML model using the Ludwig AI engine to predict a column in a table. -### 🔮 [MNIST Digit Recognition](https://evadb.readthedocs.io/en/stable/source/tutorials/01-mnist.html) (Image Classification Model) -| Source Video | Query Result | -|---------------|--------------| -|Source Video |Query Result | +```sql +CREATE FUNCTION IF NOT EXISTS PredictHouseRent FROM +( SELECT * FROM HomeRentals ) +TYPE Ludwig +PREDICT 'rental_price' +TIME_LIMIT 120; +``` -### 🔮 [Movie Emotion Analysis](https://evadb.readthedocs.io/en/stable/source/tutorials/03-emotion-analysis.html) (Face Detection + Emotion Classification Models) +
-| Source Video | Query Result | -|---------------|--------------| -|Source Video |Query Result | +## Architecture of EvaDB -### 🔮 [License Plate Recognition](https://github.com/georgia-tech-db/evadb-application-template) (Plate Detection + OCR Extraction Models) +
+EvaDB's AI-centric query optimizer takes a query as input and generates a query plan. The query engine takes the query plan and hits the relevant backends to efficiently process the query: +1. SQL Database Systems (Structured Data) +2. AI Frameworks (Transform Unstructured Data to Structured Data, Unstructured data includes PDFs, images, podcasts, etc. stored on cloud buckets or local filesystem) +3. Vector Database Systems (Feature Embeddings) -| Query Result | -|--------------| -Query Result | +

+ Architecture Diagram +

+
## Community and Support -👋 If you have general questions about EvaDB, want to say hello or just follow along, please join our [Slack Community](https://join.slack.com/t/eva-db/shared_invite/zt-1i10zyddy-PlJ4iawLdurDv~aIAq90Dg) and [follow us on Twitter](https://twitter.com/evadb_ai). - - - EvaDB Slack Channel - + -If you run into any problems or issues, please create a Github issue. +If you run into any bugs or have any comments, you can reach us on our Slack Community 📟 or create a [Github Issue :bug:](https://github.com/georgia-tech-db/evadb/issues). -Don't see a feature in the list? Search our issue tracker if someone has already requested it and add a comment to it explaining your use-case, or open a new issue if not. We prioritize our [roadmap](https://github.com/orgs/georgia-tech-db/projects/3) based on user feedback, so we'd love to hear from you. +Here is EvaDB's public [roadmap 🛤️](https://github.com/orgs/georgia-tech-db/projects/3). We prioritize features based on user feedback, so we'd love to hear from you! ## Contributing -[![PyPI Version](https://img.shields.io/pypi/v/evadb.svg)](https://pypi.org/project/evadb) -[![CI Status](https://circleci.com/gh/georgia-tech-db/evadb.svg?style=svg)](https://circleci.com/gh/georgia-tech-db/evadb) -[![Documentation Status](https://readthedocs.org/projects/evadb/badge/?version=latest)](https://evadb.readthedocs.io/en/latest/index.html) +We are a lean team on a mission to bring AI inside database systems! All kinds of contributions to EvaDB are appreciated 🙌 If you'd like to get involved, here's information on where we could use your help: [contribution guide](https://evadb.readthedocs.io/en/latest/source/dev-guide/contribute.html) 🤗 -EvaDB is the beneficiary of many [contributors](https://github.com/georgia-tech-db/evadb/graphs/contributors). All kinds of contributions to EvaDB are appreciated. To file a bug or to request a feature, please use GitHub issues. Pull requests are welcome. +

+ + + +

+ +
+ CI Status: -For more information, see our -[contribution guide](https://evadb.readthedocs.io/en/stable/source/contribute/index.html). +[![CI Status](https://circleci.com/gh/georgia-tech-db/evadb.svg?style=svg)](https://circleci.com/gh/georgia-tech-db/evadb) +[![Documentation Status](https://readthedocs.org/projects/evadb/badge/?version=latest)](https://evadb.readthedocs.io/en/latest/index.html) +
## Star History - - EvaDB Star History Chart - +

+ + EvaDB Star History Chart + +

## License -Copyright (c) 2018--present [Georgia Tech Database Group](http://db.cc.gatech.edu/). -Licensed under [Apache License](LICENSE). +Copyright (c) [Georgia Tech Database Group](http://db.cc.gatech.edu/). +Licensed under an [Apache License](LICENSE.txt). diff --git a/benchmark/text_summarization/text_summarization_with_evadb.py b/benchmark/text_summarization/text_summarization_with_evadb.py index 7e788a0882..f5e7e5237f 100644 --- a/benchmark/text_summarization/text_summarization_with_evadb.py +++ b/benchmark/text_summarization/text_summarization_with_evadb.py @@ -16,10 +16,10 @@ cursor.query("DROP UDF IF EXISTS TextSummarizer;").df() cursor.query("""CREATE UDF IF NOT EXISTS TextSummarizer TYPE HuggingFace - 'task' 'summarization' - 'model' 'sshleifer/distilbart-cnn-12-6' - 'min_length' 5 - 'max_length' 100;""").df() + TASK 'summarization' + MODEL 'sshleifer/distilbart-cnn-12-6' + MIN_LENGTH 5 + MAX_LENGTH 100;""").df() cursor.query("DROP TABLE IF EXISTS cnn_news_summary;").df() diff --git a/data/forecasting/air-passengers.csv b/data/forecasting/air-passengers.csv new file mode 100644 index 0000000000..c8e1fb2ba1 --- /dev/null +++ b/data/forecasting/air-passengers.csv @@ -0,0 +1,145 @@ +unique_id,ds,y +AirPassengers,1949-01-01,112 +AirPassengers,1949-02-01,118 +AirPassengers,1949-03-01,132 +AirPassengers,1949-04-01,129 +AirPassengers,1949-05-01,121 +AirPassengers,1949-06-01,135 +AirPassengers,1949-07-01,148 +AirPassengers,1949-08-01,148 +AirPassengers,1949-09-01,136 +AirPassengers,1949-10-01,119 +AirPassengers,1949-11-01,104 +AirPassengers,1949-12-01,118 +AirPassengers,1950-01-01,115 +AirPassengers,1950-02-01,126 +AirPassengers,1950-03-01,141 +AirPassengers,1950-04-01,135 +AirPassengers,1950-05-01,125 +AirPassengers,1950-06-01,149 +AirPassengers,1950-07-01,170 +AirPassengers,1950-08-01,170 +AirPassengers,1950-09-01,158 +AirPassengers,1950-10-01,133 +AirPassengers,1950-11-01,114 +AirPassengers,1950-12-01,140 +AirPassengers,1951-01-01,145 
+AirPassengers,1951-02-01,150 +AirPassengers,1951-03-01,178 +AirPassengers,1951-04-01,163 +AirPassengers,1951-05-01,172 +AirPassengers,1951-06-01,178 +AirPassengers,1951-07-01,199 +AirPassengers,1951-08-01,199 +AirPassengers,1951-09-01,184 +AirPassengers,1951-10-01,162 +AirPassengers,1951-11-01,146 +AirPassengers,1951-12-01,166 +AirPassengers,1952-01-01,171 +AirPassengers,1952-02-01,180 +AirPassengers,1952-03-01,193 +AirPassengers,1952-04-01,181 +AirPassengers,1952-05-01,183 +AirPassengers,1952-06-01,218 +AirPassengers,1952-07-01,230 +AirPassengers,1952-08-01,242 +AirPassengers,1952-09-01,209 +AirPassengers,1952-10-01,191 +AirPassengers,1952-11-01,172 +AirPassengers,1952-12-01,194 +AirPassengers,1953-01-01,196 +AirPassengers,1953-02-01,196 +AirPassengers,1953-03-01,236 +AirPassengers,1953-04-01,235 +AirPassengers,1953-05-01,229 +AirPassengers,1953-06-01,243 +AirPassengers,1953-07-01,264 +AirPassengers,1953-08-01,272 +AirPassengers,1953-09-01,237 +AirPassengers,1953-10-01,211 +AirPassengers,1953-11-01,180 +AirPassengers,1953-12-01,201 +AirPassengers,1954-01-01,204 +AirPassengers,1954-02-01,188 +AirPassengers,1954-03-01,235 +AirPassengers,1954-04-01,227 +AirPassengers,1954-05-01,234 +AirPassengers,1954-06-01,264 +AirPassengers,1954-07-01,302 +AirPassengers,1954-08-01,293 +AirPassengers,1954-09-01,259 +AirPassengers,1954-10-01,229 +AirPassengers,1954-11-01,203 +AirPassengers,1954-12-01,229 +AirPassengers,1955-01-01,242 +AirPassengers,1955-02-01,233 +AirPassengers,1955-03-01,267 +AirPassengers,1955-04-01,269 +AirPassengers,1955-05-01,270 +AirPassengers,1955-06-01,315 +AirPassengers,1955-07-01,364 +AirPassengers,1955-08-01,347 +AirPassengers,1955-09-01,312 +AirPassengers,1955-10-01,274 +AirPassengers,1955-11-01,237 +AirPassengers,1955-12-01,278 +AirPassengers,1956-01-01,284 +AirPassengers,1956-02-01,277 +AirPassengers,1956-03-01,317 +AirPassengers,1956-04-01,313 +AirPassengers,1956-05-01,318 +AirPassengers,1956-06-01,374 +AirPassengers,1956-07-01,413 
+AirPassengers,1956-08-01,405 +AirPassengers,1956-09-01,355 +AirPassengers,1956-10-01,306 +AirPassengers,1956-11-01,271 +AirPassengers,1956-12-01,306 +AirPassengers,1957-01-01,315 +AirPassengers,1957-02-01,301 +AirPassengers,1957-03-01,356 +AirPassengers,1957-04-01,348 +AirPassengers,1957-05-01,355 +AirPassengers,1957-06-01,422 +AirPassengers,1957-07-01,465 +AirPassengers,1957-08-01,467 +AirPassengers,1957-09-01,404 +AirPassengers,1957-10-01,347 +AirPassengers,1957-11-01,305 +AirPassengers,1957-12-01,336 +AirPassengers,1958-01-01,340 +AirPassengers,1958-02-01,318 +AirPassengers,1958-03-01,362 +AirPassengers,1958-04-01,348 +AirPassengers,1958-05-01,363 +AirPassengers,1958-06-01,435 +AirPassengers,1958-07-01,491 +AirPassengers,1958-08-01,505 +AirPassengers,1958-09-01,404 +AirPassengers,1958-10-01,359 +AirPassengers,1958-11-01,310 +AirPassengers,1958-12-01,337 +AirPassengers,1959-01-01,360 +AirPassengers,1959-02-01,342 +AirPassengers,1959-03-01,406 +AirPassengers,1959-04-01,396 +AirPassengers,1959-05-01,420 +AirPassengers,1959-06-01,472 +AirPassengers,1959-07-01,548 +AirPassengers,1959-08-01,559 +AirPassengers,1959-09-01,463 +AirPassengers,1959-10-01,407 +AirPassengers,1959-11-01,362 +AirPassengers,1959-12-01,405 +AirPassengers,1960-01-01,417 +AirPassengers,1960-02-01,391 +AirPassengers,1960-03-01,419 +AirPassengers,1960-04-01,461 +AirPassengers,1960-05-01,472 +AirPassengers,1960-06-01,535 +AirPassengers,1960-07-01,622 +AirPassengers,1960-08-01,606 +AirPassengers,1960-09-01,508 +AirPassengers,1960-10-01,461 +AirPassengers,1960-11-01,390 +AirPassengers,1960-12-01,432 diff --git a/docs/Makefile b/docs/Makefile index 6c27cca707..c17f086e1e 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -3,7 +3,7 @@ # You can set these variables from the command line, and also # from the environment for the first two. -SPHINXOPTS ?= -vvv +SPHINXOPTS ?= -vvv -W SPHINXBUILD ?= sphinx-build SOURCEDIR = . 
BUILDDIR = _build @@ -21,4 +21,4 @@ help: clean: rm -rf $(BUILDDIR)/* - rm -rf ./source/reference/python/doc/* \ No newline at end of file + rm -rf ./source/reference/python/doc/* diff --git a/docs/Readme.md b/docs/README.md similarity index 76% rename from docs/Readme.md rename to docs/README.md index fa252de75d..7bd445ca80 100644 --- a/docs/Readme.md +++ b/docs/README.md @@ -8,9 +8,8 @@ make html open _build/html/index.html ``` -To test links: +To further test external links: ``` -cd evadb/docs -sphinx-build . _build -b linkcheck -``` \ No newline at end of file +make linkcheck +``` diff --git a/docs/_toc.yml b/docs/_toc.yml index ae1a4950f4..927d131973 100644 --- a/docs/_toc.yml +++ b/docs/_toc.yml @@ -5,34 +5,40 @@ parts: chapters: - file: source/overview/getting-started sections: - - file: source/overview/getting-started/install-guide - title: Installation Guide + - file: source/overview/getting-started/installation-options + title: Installation Options + - file: source/overview/connect-to-database + title: Connect to Database - file: source/overview/concepts - # - file: source/overview/faq + title: Concepts + sections: + - file: source/overview/concepts/data-sources + title: Data Sources + #- file: source/overview/faq - caption: Use Cases chapters: + - file: source/usecases/sentiment-analysis.rst + title: Sentiment Analysis + - file: source/usecases/question-answering.rst + title: Question Answering + - file: source/usecases/text-summarization.rst + title: Text Summarization - file: source/usecases/image-classification.rst title: Image Classification - - file: source/usecases/similar-image-search.rst - title: Image Search [FAISS] - - file: source/usecases/qa-video.rst - title: Q&A from Videos [ChatGPT + HuggingFace] - - file: source/usecases/02-object-detection.ipynb + - file: source/usecases/image-search.rst + title: Image Search + - file: source/usecases/object-detection.rst title: Object Detection - - file: source/usecases/03-emotion-analysis.ipynb + - 
file: source/usecases/emotion-analysis.rst title: Emotion Analysis - - file: source/usecases/07-object-segmentation-huggingface.ipynb - title: Image Segmentation [HuggingFace] - + - file: source/usecases/privategpt.rst + title: PrivateGPT - caption: User Reference chapters: - - file: source/reference/api - title: Python API - - file: source/reference/evaql - title: Eva Query Language + title: EvaQL sections: - file: source/reference/evaql/load - file: source/reference/evaql/select @@ -43,22 +49,34 @@ parts: - file: source/reference/evaql/insert - file: source/reference/evaql/delete - file: source/reference/evaql/rename + - file: source/reference/evaql/use - - file: source/reference/udfs/index - title: Models + - file: source/reference/api + title: Python API + + - file: source/reference/databases/index + title: Data Sources + sections: + - file: source/reference/databases/postgres + - file: source/reference/databases/sqlite + - file: source/reference/databases/mysql + + - file: source/reference/ai/index + title: AI Engines sections: - - file: source/reference/udfs/hf + - file: source/reference/ai/model-train + title: Model Training + - file: source/reference/ai/model-forecasting + title: Time Series Forecasting + - file: source/reference/ai/hf title: Hugging Face - - file: source/reference/udfs/openai + - file: source/reference/ai/openai title: OpenAI - - file: source/reference/udfs/yolo + - file: source/reference/ai/yolo title: YOLO - - file: source/reference/udfs/custom + - file: source/reference/ai/custom title: Custom Model - - file: source/reference/udfs/model-train - title: Model Train/Finetune - # - file: source/reference/io # title: IO Descriptors @@ -73,17 +91,14 @@ parts: - file: source/benchmarks/text_summarization.rst title: Text Summarization - - caption: Developer Guide + - caption: Developer Reference chapters: - - file: source/dev-guide/architecture - title: Architecture Design of EvaDB - - file: source/dev-guide/contribute title: Contributing to 
EvaDB sections: - file: source/dev-guide/contribute/setup-dev title: Setup Environment - - file: source/contribute/contribute/testing + - file: source/dev-guide/contribute/testing title: Testing - file: source/dev-guide/contribute/submit-pr title: Submit a PR @@ -98,7 +113,7 @@ parts: - file: source/dev-guide/debugger/vscode-debugger title: VSCode Debugger - file: source/dev-guide/debugger/alternative - title: Alternaitve Debugger + title: Alternative Debugger - file: source/dev-guide/extend title: Extending EvaDB @@ -114,4 +129,7 @@ parts: - file: source/dev-guide/release/pypi-account title: Setup PyPI Account - file: source/dev-guide/release/release-steps - title: How to Release + title: Release Guide + + - file: source/dev-guide/architecture + title: Architecture Diagram diff --git a/docs/conf.py b/docs/conf.py index 601fd64abe..644db7510f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -31,7 +31,7 @@ "sphinx_copybutton", "sphinx.ext.doctest", "sphinx.ext.coverage", - "sphinx.ext.autosectionlabel", +# "sphinx.ext.autosectionlabel", "sphinx.ext.autosummary", "sphinx.ext.autodoc", "sphinx.ext.autodoc.typehints", @@ -48,6 +48,9 @@ "versionwarning.extension", "IPython.sphinxext.ipython_console_highlighting", ] + +suppress_warnings = ["etoc.toctree", "myst.header"] + source_suffix = [".ipynb", ".html", ".md", ".rst"] autodoc_pydantic_model_show_json = False @@ -91,7 +94,8 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "README.md", "images/reference/README.md"] + # The name of the Pygments (syntax highlighting) style to use. 
# pygments_style = "lovelace" @@ -148,14 +152,10 @@ "https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/solid.min.css", "https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/brands.min.css"] -# Adding the Tutorial notebooks to ./docs/source/tutorials/ - -for i in os.listdir("../tutorials"): - if i.endswith(".ipynb"): - shutil.copy(f"../tutorials/{i}", "./source/usecases/") - -jupyter_execute_notebooks = "off" - +# Check link: https://stackoverflow.com/questions/14492743/have-sphinx-report-broken-links/14735060#14735060 +nitpicky = True +# BUG: https://stackoverflow.com/questions/11417221/sphinx-autodoc-gives-warning-pyclass-reference-target-not-found-type-warning +nitpick_ignore_regex = [('py:class', r'.*')] # -- Initialize Sphinx ---------------------------------------------- def setup(app): @@ -165,4 +165,4 @@ def setup(app): message=r".*Container node skipped.*", ) # Custom JS - app.add_js_file("js/top-navigation.js", defer="defer") \ No newline at end of file + app.add_js_file("js/top-navigation.js", defer="defer") diff --git a/docs/images/evadb/eva-arch-for-user.png b/docs/images/evadb/eva-arch-for-user.png new file mode 100644 index 0000000000..744a04c9f3 Binary files /dev/null and b/docs/images/evadb/eva-arch-for-user.png differ diff --git a/docs/index.rst b/docs/index.rst index d607a4651b..471829b8f5 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,43 +1,37 @@ .. meta:: - :keywords: database, deep learning, video analytics, language models + :keywords: database, AI, language models, SQL, video analytics -Welcome to EvaDB! +Welcome to EvaDB! ================= -.. rubric:: Database system for building simpler and faster AI-powered apps. - -.. - .. figure:: https://raw.githubusercontent.com/georgia-tech-db/evadb/master/docs/images/evadb/evadb-banner.png - :target: https://github.com/georgia-tech-db/evadb - :width: 100% - :alt: EvaDB Banner - -.. 
|pypi_status| image:: https://img.shields.io/pypi/v/evadb.svg - :target: https://pypi.org/project/evadb -.. |License| image:: https://img.shields.io/badge/license-Apache%202-brightgreen.svg?logo=apache - :target: https://github.com/georgia-tech-db/evadb/blob/master/LICENSE.txt - - -|pypi_status| |License| +.. rubric:: Bring AI inside your Database System ---------- -EvaDB is an open-source unified framework for developing AI-powered apps on top of your data sources. It offers a SQL-like declarative language to simplify the development and deployment of AI-powered apps, which can work with structured data (such as tables and feature stores) and unstructured data (like videos, text, podcasts, PDFs, and more). +EvaDB enables software developers to build AI apps in a few lines of code. Its powerful SQL API simplifies AI app development for both structured and unstructured data. EvaDB's benefits include: -- Github: https://github.com/georgia-tech-db/evadb -- PyPI: https://pypi.org/project/evadb/ -- Twitter: https://twitter.com/evadb_ai -- Slack: https://join.slack.com/t/eva-db/shared_invite/zt-1i10zyddy-PlJ4iawLdurDv~aIAq90Dg +- 🔮 Easy to connect EvaDB with your SQL database system and build AI-powered apps with SQL queries +- 🤝 Query your data with a pre-trained AI model from Hugging Face, OpenAI, YOLO, PyTorch, and other AI frameworks +- ⚡️ Faster queries thanks to AI-centric query optimization +- 💰 Save money spent on running models by efficient CPU/GPU use +- 🔧 Fine-tune your AI models to achieve better results +👋 Hey! If you're excited about our vision of bringing AI inside database systems, show some ❤️ by: + +- 🐙 giving a ⭐ on our EvaDB repo on Github: https://github.com/georgia-tech-db/evadb +- 📟 joining our Slack Community: https://evadb.ai/community +- 🐦 following us on Twitter: https://twitter.com/evadb_ai +- 📝 following us on Medium: https://medium.com/evadb-blog Why EvaDB? 
---------- -Over the last decade, AI models have radically changed the world of natural language processing and computer vision. They are accurate on various tasks ranging from question answering to object tracking in videos. However, it is challenging for users to leverage these models due to two challenges: +In the world of AI, we've reached a stage where many AI tasks that were traditionally handled by AI or ML engineers can now be automated. EvaDB enables software developers with the ability to perform advanced AI tasks without needing to delve into the intricate details. + +EvaDB covers many AI applications, including regression, classification, image recognition, question answering, and many other generative AI applications. EvaDB targets 99% of AI problems that are often repetitive and can be automated with a simple function call in an SQL query. Until now, there is no comprehensive open-source framework for bringing AI into an existing SQL database system with a principled AI optimization framework, and that's where EvaDB comes in. -- **Usability**: To use an AI model, the user needs to program against multiple low-level libraries, like PyTorch, Hugging Face, Open AI, etc. This tedious process often leads to a complex AI app that glues together these libraries to accomplish the given task. This programming complexity prevents people who are experts in other domains from benefiting from these models. +Our target audience is software developers who may not necessarily have a background in AI but require AI capabilities to solve specific problems. We target programmers who write simple SQL queries inside their CRUD apps. With EvaDB, it is possible to easily add AI features to these apps by calling built-in AI functions in the queries. -- **Money & Time**: Running these deep learning models on large document or video datasets is costly and time-consuming. 
For example, the state-of-the-art object detection model takes multiple GPU years to process just a week's videos from a single traffic monitoring camera. Besides the money spent on hardware, these models also increase the time that you spend waiting for the model inference to finish. Getting Started ---------------- @@ -48,7 +42,7 @@ Getting Started
-

Learn basics

+

Learn Basics

Understand how to use EvaDB to build AI apps.

Understand how to use EvaDB to build AI apps.

@@ -58,14 +52,16 @@ Getting Started
-

Features

+

Key Concepts

-

Learn about the EvaDB features.

-

Learn about the EvaDB features.

+

Learn the + high-level concepts related to EvaDB.

+

+ Learn the high-level concepts related to EvaDB.

Learn more >

-
@@ -77,30 +73,3 @@ Getting Started

Support >

- - -Key Features ------------- - -- 🔮 Build simpler AI-powered apps using short Python functions or SQL queries -- ⚡️ 10x faster AI apps using AI-centric query optimization -- 💰 Save money spent on GPUs -- 🚀 First-class support for your custom deep learning models through user-defined functions -- 📦 Built-in caching to eliminate redundant model invocations across queries -- ⌨️ First-class support for PyTorch, Hugging Face, YOLO, and Open AI models -- 🐍 Installable via pip and fully implemented in Python - - - -Try it out! ------------- - -- `PrivateGPT `_ -- `Video Question Answering using ChatGPT `_ -- `Querying PDF documents `_ -- `Analyzing traffic flow at an intersection `_ -- `Examining the emotion palette of actors in a movie `_ -- `Classifying images based on their content `_ - - - diff --git a/docs/source/benchmarks/text_summarization.rst b/docs/source/benchmarks/text_summarization.rst index 0f93896b42..119803f390 100644 --- a/docs/source/benchmarks/text_summarization.rst +++ b/docs/source/benchmarks/text_summarization.rst @@ -1,21 +1,27 @@ -Text summarization benchmark -==== -In this benchmark, we compare the performance of text summarization between EvaDB and MindsDB on `CNN-DailyMail News `_. +Text Summarization Benchmark +============================ -1. Prepare dataset ----- +In this benchmark, we compare the runtime performance of EvaDB and MindsDB on +a text summarization application operating on a news dataset. In particular, +we focus on the `CNN-DailyMail News `_ dataset. + +All the relevant files are located in the `text summarization benchmark folder on Github `_. + +Prepare dataset +--------------- .. code-block:: bash cd benchmark/text_summarization bash download_dataset.sh -2. Using EvaDB to summarize the CNN DailyMail News ----- +Use EvaDB for Text Summarization +-------------------------------- .. 
note:: - Install ray in your EvaDB virtual environment: ``pip install evadb[ray]`` + Install ``ray`` along with EvaDB to speed up the queries: + ``pip install evadb[ray]`` .. code-block:: bash @@ -23,11 +29,53 @@ In this benchmark, we compare the performance of text summarization between EvaD python text_summarization_with_evadb.py -3. Using MindsDB to summarize the CNN DailyMail News ---- +Loading Data Into EvaDB +~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: sql + + CREATE TABLE IF NOT EXISTS cnn_news_test( + id TEXT(128), + article TEXT(4096), + highlights TEXT(1024) + ); + +Creating Text Summarization Function in EvaDB +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: sql + + CREATE UDF IF NOT EXISTS TextSummarizer + TYPE HuggingFace + TASK 'summarization' + MODEL 'sshleifer/distilbart-cnn-12-6' + MIN_LENGTH 5 + MAX_LENGTH 100; + + +Tuning EvaDB for Maximum GPU Utilization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + cursor._evadb.config.update_value("executor", "batch_mem_size", 300000) + cursor._evadb.config.update_value("executor", "gpu_ids", [0,1]) + cursor._evadb.config.update_value("experimental", "ray", True) + -Prepare sqlite database for MindsDB -**** +Text Summarization Query in EvaDB +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: sql + + CREATE TABLE IF NOT EXISTS cnn_news_summary AS + SELECT TextSummarizer(article) FROM cnn_news_test; + +Use MindsDB for Text Summarization +----------------------------------- + +Setup SQLite Database +~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: bash @@ -38,25 +86,26 @@ Prepare sqlite database for MindsDB Install MindsDB -**** -Follow the `Setup for Source Code via pip `_ to install mindsdb. +~~~~~~~~~~~~~~~ + +Follow the `MindsDB installation guide `_ to install it via ``pip``. .. note:: - At the time of this documentation, we need to manully ``pip install evaluate`` for huggingface model to work in MindsDB. 
+ You will need to manually run ``pip install evaluate`` for the ``HuggingFace`` model to work in MindsDB. -After the installation, we use mysql cli to connect to MindsDB. Replace the port number as needed. +After installation, use the ``MySQL`` client for connecting to ``MindsDB``. Update the port number if needed. .. code-block:: bash mysql -h 127.0.0.1 --port 47335 -u mindsdb -p -Run Experiment -**** +Benchmark MindsDB +~~~~~~~~~~~~~~~~~ -Connect the sqlite database we created before. +Connect ``MindsDB`` to the ``sqlite`` database we created before: -.. code-block:: sql +.. code-block:: text CREATE DATABASE sqlite_datasource WITH ENGINE = 'sqlite', @@ -64,9 +113,9 @@ Connect the sqlite database we created before. "db_file": "cnn_news_test.db" }; -Create text summarization model and wait for its readiness. +Create a ``text summarization`` model and wait for it to be ``ready``. -.. code-block:: sql +.. code-block:: text CREATE MODEL mindsdb.hf_bart_sum_20 PREDICT PRED @@ -80,9 +129,9 @@ Create text summarization model and wait for its readiness. DESCRIBE mindsdb.hf_bart_sum_20; -Use the model to summarize the CNN DailyMail news. +Use the ``text summarization`` model to summarize the CNN news dataset: -.. code-block:: sql +.. code-block:: text CREATE OR REPLACE TABLE sqlite_datasource.cnn_news_summary ( SELECT PRED @@ -91,19 +140,33 @@ Use the model to summarize the CNN DailyMail news. ); -4. Experiment results ----- -Below are nubmers from a server with 56 Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz and two Quadro P6000 GPU. - -.. list-table:: Text summarization with ``sshleifer/distilbart-cnn-12-6`` on CNN-DailyMail News - - * - - - MindsDB - - EvaDB (off-the-shelf) - - EvaDB (full GPU utilization) - * - Time - - 4 hours 45 mins 47.56 secs - - 1 hour 9 mins 39.8 secs - - 42 mins 50.22 secs - - +Benchmarking Results +-------------------- + +Here are the key runtime metrics for the ``Text Summarization`` benchmark. 
+ +The experiment is conducted on a server with 56 Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz and two Quadro P6000 GPUs. + +.. list-table:: + :widths: 20 30 30 30 + + * - + - **MindsDB** + - **EvaDB** + - **EvaDB** + * - + - **(off-the-shelf)** + - **(off-the-shelf)** + - **(tuned for maximum** + * - + - + - + - **GPU utilization)** + * - **Runtime** + - 4 hours 45 mins + - 1 hour 10 mins + - 43 mins + * - **Speedup** + - 1x + - 4.1x + - 6.3x diff --git a/docs/source/dev-guide/contribute.rst b/docs/source/dev-guide/contribute.rst index 8f580a41c8..80d9a4d43a 100644 --- a/docs/source/dev-guide/contribute.rst +++ b/docs/source/dev-guide/contribute.rst @@ -1,3 +1,5 @@ +.. _contributing: + Contributing ---------------- @@ -6,8 +8,8 @@ We welcome all kinds of contributions to EvaDB. - `Code reviews `_ - `Improving documentation `_ - `Tutorials and applications `_ -- `New features `__ +- :ref:`Extending EvaDB` This document goes over how to contribute to EvaDB in details. -.. tableofcontents:: \ No newline at end of file +.. tableofcontents:: diff --git a/docs/source/dev-guide/contribute/setup-dev.rst b/docs/source/dev-guide/contribute/setup-dev.rst index 0e9dd1225c..6c9f691379 100644 --- a/docs/source/dev-guide/contribute/setup-dev.rst +++ b/docs/source/dev-guide/contribute/setup-dev.rst @@ -27,10 +27,10 @@ After installing the package locally, you can make changes and run the test case .. note:: EvaDB provides multiple installation options for extending its functionalities. - Please see :doc:`Installation Guide ` for all options. + Please see :ref:`installation options` for all options. Other options can be installed with the ``dev`` environment. .. 
code-block:: bash - pip install -e ".[dev,ludwig]" \ No newline at end of file + pip install -e ".[dev,ludwig]" diff --git a/docs/source/dev-guide/contribute/submit-pr.rst b/docs/source/dev-guide/contribute/submit-pr.rst index 59040fc4ff..1fc2917e07 100644 --- a/docs/source/dev-guide/contribute/submit-pr.rst +++ b/docs/source/dev-guide/contribute/submit-pr.rst @@ -18,7 +18,7 @@ Merge the most recent changes from the ``staging`` branch git pull . origin/staging Testing -------- +----------------- Run the :doc:`test script <./testing>` to ensure that all the test cases pass. @@ -35,4 +35,4 @@ Run the following command to ensure that code is properly formatted. .. code-block:: python - python script/formatting/formatter.py \ No newline at end of file + python script/formatting/formatter.py diff --git a/docs/source/dev-guide/extend.rst b/docs/source/dev-guide/extend.rst index 05d3d27a4d..69a119e7da 100644 --- a/docs/source/dev-guide/extend.rst +++ b/docs/source/dev-guide/extend.rst @@ -1,5 +1,7 @@ +.. _Extending EvaDB: + Extending EvaDB -==== +=============== This document details steps involved in extending EvaDB. .. tableofcontents:: diff --git a/docs/source/dev-guide/extend/new-command.rst b/docs/source/dev-guide/extend/new-command.rst index 337a78adc9..c5963c8aef 100644 --- a/docs/source/dev-guide/extend/new-command.rst +++ b/docs/source/dev-guide/extend/new-command.rst @@ -1,10 +1,10 @@ Operators / Commands -============= +==================== This document details the steps involved in adding support for a new operator (or command) in EvaDB. We illustrate the process using a DDL command. Command Handler ----- +--------------- An input query string is handled by **Parser**, **StatementTOPlanConverter**, **PlanGenerator**, and **PlanExecutor**. @@ -83,7 +83,7 @@ parser/ .. _2-statementtoplanconverter: 2. Statement To Plan Converter ---------------------------- +------------------------------ The part transforms the statement into corresponding logical plan. 
@@ -154,13 +154,13 @@ Optimizer .. _3-plangenerator: 3. Plan Generator ----------------- +------------------ The part transformed logical plan to physical plan. The modified files are stored under **Optimizer** and **Planner** folders. plan_nodes/ -~~~~~~~~ +~~~~~~~~~~~~ - ``[cmd]_plan.py`` - class [cmd]Plan, which stored information required for rename table. @@ -220,7 +220,7 @@ optimizer/rules .. _4-PlanExecutor: 4. Plan Executor --------------- +----------------- ``PlanExecutor`` uses data stored in physical plan to run the command. diff --git a/docs/source/dev-guide/extend/new-data-source.rst b/docs/source/dev-guide/extend/new-data-source.rst index d8d37483b9..fa939a8240 100644 --- a/docs/source/dev-guide/extend/new-data-source.rst +++ b/docs/source/dev-guide/extend/new-data-source.rst @@ -1,19 +1,21 @@ +.. _add-data-source: + Structured Data Source Integration -==== -This document details steps invovled in adding a new structured data source integration in EvaDB. +================================== +This document details steps involved in adding a new structured data source integration in EvaDB. Example Data Source Integration In EvaDB ----- +---------------------------------------- - `PostgreSQL `_ Create Data Source Handler ----- +-------------------------- 1. Create a new directory at `evadb/third_party/databases/ `_ -~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. note:: @@ -32,7 +34,7 @@ The *__init__.py* can contain copyright information. The *requirements.txt* cont EvaDB will only install a data source's specific dependency libraries when a connection to the data source is created by the user via, e.g., `CREATE DATABASE mydb_source WITH ENGINE = "mydb";`. 2. Implement the data source handler -~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In *mydb_handler.py*, you need to implement the `DBHandler` declared at `evadb/third_party/databases/types.py `_. 
There are 7 functions that you need to implement: @@ -70,13 +72,13 @@ You can get the data source's configuration parameters from `__init__(self, name .. note:: - Those paramters will be specified when the user creates a connection to the data source: `CREATE DATABASE mydb_source WITH ENGINE = "mydb", PARAMETERS = {"host": "localhost", "port": "5432", "user": "eva", "password": "password"};`. + Those parameters will be specified when the user creates a connection to the data source: `CREATE DATABASE mydb_source WITH ENGINE = "mydb", PARAMETERS = {"host": "localhost", "port": "5432", "user": "eva", "password": "password"};`. You can check the PostgreSQL's handler example at `evadb/third_party/databases/postgres/postgres_handler.py `_ for ideas. Register the Data Source Handler ----- +-------------------------------- Add your created data source handler in `get_database_handler` function at `evadb/third_party/databases/interface.py `_. Below is an example of registering the created mydb data source: @@ -87,3 +89,10 @@ Add your created data source handler in `get_database_handler` function at `evad return mod.MydbHandler(engine, **kwargs) ... +Add the Data Source in Documentation +------------------------------------ + +Add your new data source into :ref:`databases` section for reference. + +- Create ``mydb.rst`` under `evadb/docs/source/reference/databases `_ directory. You can refer to the existing documentation under the directory for example information to be covered in ``mydb.rst``. +- Update ``source/reference/databases/postgres`` in `evadb/docs/_toc.yml `_. diff --git a/docs/source/dev-guide/internals.rst b/docs/source/dev-guide/internals.rst deleted file mode 100644 index 88370d2204..0000000000 --- a/docs/source/dev-guide/internals.rst +++ /dev/null @@ -1,44 +0,0 @@ -EvaDB Internals -============= - -Path of a Query -------------------- - -The following code represents a sequence of operations that can be used to execute a query in a evaql database. 
found in `evadb/server/command_handler.py `_ - - Parse the query using the Parser() function provided by the evaql library. The result of this step will be a parsed representation of the query in the form of an abstract syntax tree (AST). - -.. code:: python - - stmt = Parser().parse(query)[0] - -Bind the parsed AST to a statement context using the StatementBinder() function. This step resolves references to schema objects and performs other semantic checks on the query. - -.. code:: python - - StatementBinder(StatementBinderContext()).bind(stmt) - -Convert the bound AST to a logical plan using the StatementToPlanConvertor() function. This step generates a logical plan that specifies the sequence of operations needed to execute the query. - -.. code:: python - - l_plan = StatementToPlanConvertor().visit(stmt) - -Generate a physical plan from the logical plan using the plan_generator.build() function. This step optimizes the logical plan and generates a physical plan that specifies how the query will be executed. - -.. code:: python - - p_plan = plan_generator.build(l_plan) - -Execute the physical plan using the PlanExecutor() function. This step retrieves the data from the database and produces the final output of the query. - -.. code:: python - - output = PlanExecutor(p_plan).execute_plan() - -Overall, this sequence of operations represents the path of query execution in a evaql database, from parsing the query to producing the final output. - -Topics ------- - -.. tableofcontents:: diff --git a/docs/source/dev-guide/internals/catalog.rst b/docs/source/dev-guide/internals/catalog.rst deleted file mode 100644 index a5236f5ffb..0000000000 --- a/docs/source/dev-guide/internals/catalog.rst +++ /dev/null @@ -1,36 +0,0 @@ -Catalog -======= - -Catalog Manager ---------------- - -`catalog_manager `_ provides a set of services to interact with a database that stores metadata about tables, columns, and user-defined functions (UDFs). 
Information like what is the data type in a certain column in a table, type of a table, its name, etc.. It contains functions to get, insert and delete catalog entries for Tables, UDFs, UDF IOs, Columns and Indexes. - -This data is stored in the evadb_catalog.db file which can be found in `~/.evadb//` folder. - -Catalog manager currently has 5 services in it: - -.. code:: python - - TableCatalogService() - ColumnCatalogService() - UdfCatalogService() - UdfIOCatalogService() - IndexCatalogService() - -Catalog Services ----------------- - -This class provides functionality related to a table catalog, including inserting, getting, deleting, and renaming table entries, as well as retrieving all entries. e.g. the TableCatalogService contains code to get, insert and delete a table. - -Catalog Models --------------- - -These contain the data model that is used by the catalog services. -Each model represents a table in the underlying database. - -.. |pic1| image:: ../../../images/reference/catalog.png - :width: 45% - :alt: Catalog Diagram - -|pic1| diff --git a/docs/source/dev-guide/packaging.rst b/docs/source/dev-guide/packaging.rst index 9b16b0f391..3bcf6b589b 100644 --- a/docs/source/dev-guide/packaging.rst +++ b/docs/source/dev-guide/packaging.rst @@ -1,3 +1,5 @@ +:orphan: + .. _guide-packaging: Packaging @@ -5,15 +7,13 @@ Packaging This section describes practices to follow when packaging your own models or datasets to be used along with EvaDB. -.. tableofcontents:: - Models -------- Please follow the following steps to package models: -* Create a folder with a descriptive name. This folder name will be used by the UDF that is invoking your model. -* Place all files used by the UDF inside this folder. These are typically: +* Create a folder with a descriptive name. This folder name will be used by the function that is invoking your model. +* Place all files used by the function inside this folder. 
These are typically: * Model weights (The .pt files that contain the actual weights) * Model architectures (The .pt files that contain model architecture information) * Label files (Extra files that are used in the process of model inference for outputting labels.) diff --git a/docs/source/dev-guide/release.rst b/docs/source/dev-guide/release.rst index 3d690348ce..0d43cf7a84 100644 --- a/docs/source/dev-guide/release.rst +++ b/docs/source/dev-guide/release.rst @@ -1,5 +1,5 @@ EvaDB Release Guide ----- +------------------- Before you start, make sure you have :doc:`setup a PyPI account <./release/pypi-account>`. diff --git a/docs/source/dev-guide/release/pypi-account.rst b/docs/source/dev-guide/release/pypi-account.rst index d7723cc773..c28e94c91f 100644 --- a/docs/source/dev-guide/release/pypi-account.rst +++ b/docs/source/dev-guide/release/pypi-account.rst @@ -1,5 +1,5 @@ Setup PyPI Account -==== +================== Make sure you have `PyPI `_ account with maintainer access to the EvaDB project. Create a .pypirc in your home directory. It should look like this: @@ -15,4 +15,4 @@ Create a .pypirc in your home directory. It should look like this: username=YOUR_USERNAME password=YOUR_PASSWORD -Then run ``chmod 600 ./.pypirc`` so that only you can read/write the file. \ No newline at end of file +Then run ``chmod 600 ./.pypirc`` so that only you can read/write the file. diff --git a/docs/source/dev-guide/release/release-steps.rst b/docs/source/dev-guide/release/release-steps.rst index 5a6e6bea26..baf33e38b3 100644 --- a/docs/source/dev-guide/release/release-steps.rst +++ b/docs/source/dev-guide/release/release-steps.rst @@ -1,5 +1,5 @@ Release Steps -==== +============= 1. Ensure that you're in the top-level ``eva`` directory. 2. Ensure that your branch is in sync with the ``master`` branch: @@ -66,7 +66,7 @@ merged since the last release. pip install twine # if not installed twine upload dist/* -r pypi -11. 
A PR is automatically submitted (this will take a few hours) on [`conda-forge/eva-feedstock`](https://github.com/conda-forge/eva-feedstock) to update the version. +11. A PR is automatically submitted (this will take a few hours) on [`conda-forge/eva-feedstock`] to update the version. * A maintainer needs to accept and merge those changes. 12. Create a new release on Github. @@ -106,4 +106,4 @@ merged since the last release. * For the tag ``v0.9.1`` * For ``latest`` -Credits: `Snorkel `_ \ No newline at end of file +Credits: `Snorkel `_ diff --git a/docs/source/overview/command-line.rst b/docs/source/overview/command-line.rst index 74da7ae6c6..9f518926b7 100644 --- a/docs/source/overview/command-line.rst +++ b/docs/source/overview/command-line.rst @@ -1,5 +1,7 @@ +:orphan: + Command Line Client -==== +=================== Besides Python files and Jupyter notebooks, EvaDB also supports a command line interface for querying the data. This interface allows for quick querying from the terminal: diff --git a/docs/source/overview/concepts.rst b/docs/source/overview/concepts.rst index c6ac3a354b..1c9317114d 100644 --- a/docs/source/overview/concepts.rst +++ b/docs/source/overview/concepts.rst @@ -2,132 +2,84 @@ Concepts ========= -These are some high-level concepts related to EvaDB. If you still have questions after reading this documents, ping us on `our Slack `__! +Here is a list of key concepts in EvaDB. If you have any questions, ask the community on `Slack `__. +EvaQL: AI-Centric Query Language +-------------------------------- -Quickly build AI-Powered Apps ---------------------------------- +EvaDB supports a SQL-like query language, called ``EvaQL``, designed to assist software developers in bringing AI into their applications. -EvaDB supports a simple SQL-like query language designed to make it easier for users to leverage AI models. It is easy to chain multiple models in a single query to accomplish complicated tasks with minimal programming. 
+Here is a set of illustrative EvaQL queries for a ChatGPT-based video question answering app. This EvaDB app connects to a collection of news videos stored in a folder and runs an AI query for extracting audio transcripts from the videos using a Hugging Face model, followed by another AI query for question answering using ChatGPT. -Here is an illustrative EvaDB app for ChatGPT-based question answering on videos. The app loads a collection of news videos into EvaDB and runs a query for extracting audio transcripts from the videos using a HuggingFace model, followed by question answering using ChatGPT. +.. code-block:: sql -.. code-block:: python - - # pip install evadb and import it - import evadb + --- Load a collection of news videos into the 'news_videos' table + --- This command returns a Pandas Dataframe with the query's output + --- In this case, the output indicates the number of loaded videos + LOAD VIDEO 'news_videos/*.mp4' INTO VIDEOS; - # Grab a evadb cursor to load data and run queries - cursor = evadb.connect().cursor() - # Load a collection of news videos into the 'news_videos' table - # This command returns a Pandas Dataframe with the query's output - # In this case, the output indicates the number of loaded videos - cursor.load( - file_regex="news_videos/*.mp4", - format="VIDEO", - table_name="news_videos" - ).df() + --- Define an AI function that wraps around a speech-to-text model + --- This model is hosted on Hugging Face which has built-in support in EvaDB + --- After creating the function, we can use the function in any future query + CREATE UDF SpeechRecognizer + TYPE HuggingFace + TASK 'automatic-speech-recognition' + MODEL 'openai/whisper-base'; - # Define a function that wraps around a speech-to-text (Whisper) model - # After creating the function, we can use the function in any future query - cursor.create_function( - udf_name="SpeechRecognizer", - type="HuggingFace", - task='automatic-speech-recognition', - model='openai/whisper-base' - ).df() 
+ -- EvaDB automatically extracts the audio from the videos + --- We only need to run the SpeechRecognizer UDF on the 'audio' column + --- to get the transcript and persist it in a table called 'transcripts' + CREATE TABLE transcripts AS + SELECT SpeechRecognizer(audio) from news_videos; - # EvaDB automatically extract the audio from the video - # We only need to run the SpeechRecognizer UDF on the 'audio' column - # to get the transcript and persist it in a table called 'transcripts' + --- Lastly, we run the ChatGPT query for question answering + --- This query is based on the 'transcripts' table + --- The 'transcripts' table has a column called 'text' with the transcript text + --- Since ChatGPT is a built-in function in EvaDB, we don't have to define it + --- We can directly use it in any query + --- We need to set the OPENAI_KEY as an environment variable + SELECT ChatGPT('Is this video summary related to Ukraine russia war', text) + FROM TEXT_SUMMARY; +EvaQL reduces the complexity of the app, leading to more maintainable code that allows developers to build on top of each other's queries. A single AI query can use multiple AI models to accomplish complicated tasks with minimal programming. - cursor.query( - """CREATE TABLE transcripts AS - SELECT SpeechRecognizer(audio) from news_videos;""" - ).df() +AI-Centric Query Optimization +----------------------------- - # We next incrementally construct the ChatGPT query using EvaDB's Python API - # The query is based on the 'transcripts' table - # This table has a column called 'text' with the transcript text - query = cursor.table('transcripts') +EvaDB optimizes the AI queries to save money spent on running models and reduce query execution time. It contains a novel `Cascades-style extensible query optimizer `__ tailored for AI queries. Query optimization has powered traditional SQL database systems for several decades. 
It is the bridge that connects the declarative query language to efficient query execution on hardware. - # Since ChatGPT is a built-in function, we don't have to define it - # We can just directly use it in the query - # We need to set the OPENAI_KEY as an environment variable - os.environ["OPENAI_KEY"] = OPENAI_KEY - query = query.select("ChatGPT('Is this video summary related to LLM', text)") +EvaDB accelerates AI queries using a collection of optimizations inspired by SQL database systems including cost-based function predicate reordering, function caching, sampling, etc. - # Finally, we run the query to get the results as a dataframe - response = query.df() +AI Functions +------------ +``Functions`` are typically thin wrappers around AI models and are extensively used in queries. Here is an `illustrative AI function for classifying MNIST images `_. -The same AI query can also be written directly in SQL and run on EvaDB. +To register an user-defined function, use the ``CREATE FUNCTION`` statement: .. code-block:: sql - --- Query for asking question using ChatGPT - SELECT ChatGPT('Is this video summary related to LLM', - SpeechRecognizer(audio)) FROM news_videos; + --- Create an MNIST image classifier function + --- The function's implementation code is in 'mnist_image_classifier.py' + CREATE FUNCTION MnistImageClassifier + IMPL 'mnist_image_classifier.py' -EvaDB's declarative query language reduces the complexity of the app, leading to more maintainable code that allows users to build on top of each other's queries. +After registering ``MnistImageClassifier`` function, you can call the function in the ``SELECT`` and/or ``WHERE`` clauses of any query. -EvaDB comes with a wide range of models for analyzing unstructured data including image classification, object detection, OCR, face detection, etc. It is fully implemented in Python, and `licensed under the Apache license `__. 
It already contains integrations with widely-used AI pipelines based on Hugging Face, PyTorch, and Open AI. - -The high-level SQL API allows even beginners to use EvaDB in a few lines of code. Advanced users can define custom user-defined functions that wrap around any AI model or Python library. - -Save time and money ----------------------- - -EvaDB automatically optimizes the queries to save inference cost and query execution time using its Cascades-style extensible query optimizer. EvaDB's optimizer is tailored for AI pipelines. The Cascades query optimization framework has worked well in SQL database systems for several decades. Query optimization in EvaDB is the bridge that connects the declarative query language to efficient execution. - -EvaDB accelerates AI pipelines using a collection of optimizations inspired by SQL database systems including function caching, sampling, and cost-based operator reordering. - -EvaDB supports an AI-oriented query language for analyzing both structured and unstructured data. Here are some illustrative apps: - - -The `Getting Started `__ page shows how you can use EvaDB for different AI tasks and how you can easily extend EvaDB to support your custom deep learning model through user-defined functions. - -The `User Guides `__ section contains Jupyter Notebooks that demonstrate how to use various features of EvaDB. Each notebook includes a link to Google Colab, where you can run the code yourself. - - - - -User-Defined Function (UDF) or Function ------------------------------------------- - -User-defined functions are thin wrappers around deep learning models. They -allow us to use deep learning models in AI queries. - -Here is an illustrative UDF for classifying MNIST images. - -.. code-block:: bash - - !wget -nc https://raw.githubusercontent.com/georgia-tech-db/evadb/master/evadb/udfs/mnist_image_classifier.py - -.. 
code-block:: python - - cursor.create_function("MnistImageClassifier", True, 'mnist_image_classifier.py') - response = cursor.df() - print(response) - -That's it! You can now use the newly registered UDF anywhere in the query -- in the ``select`` or ``filter`` calls. - -.. code-block:: python - - query = cursor.table("MNISTVideo") - query = query.filter("id = 30 OR id = 50 OR id = 70") - - # Here, we are selecting the output of the function - query = query.select("data, MnistImageClassifier(data).label") - response = query.df() - -.. code-block:: python - - query2 = cursor.table("MNISTVideo") +.. code-block:: sql - # Here, we are also filtering based on the output of the function - query2 = query2.filter("MnistImageClassifier(data).label = '6' AND id < 10") - query2 = query2.select("data, MnistImageClassifier(data).label") - response = query2.df() \ No newline at end of file + --- Get the output of 'MnistImageClassifier' on frame id 30 + --- This query returns the results of the image classification function + --- In this case, it is the digit in the 30th frame in the video + SELECT data, id, MnistImageClassifier(data).label + FROM MnistVideo + WHERE id = 30; + + --- Use the 'MnistImageClassifier' function's output to filter frames + --- This query returns the frame ids of the frames with digit 6 + --- We limit to the first five frames containing digit 6 + SELECT data, id, MnistImageClassifier(data).label + FROM MnistVideo + WHERE MnistImageClassifier(data).label = '6' + LIMIT 5; diff --git a/docs/source/overview/concepts/data-sources.rst b/docs/source/overview/concepts/data-sources.rst new file mode 100644 index 0000000000..689e84ac53 --- /dev/null +++ b/docs/source/overview/concepts/data-sources.rst @@ -0,0 +1,20 @@ +.. _data sources: + +Data Sources +============ + +EvaDB simplifies AI app development for two types of data: (1) structured data, and (2) unstructured data. 
+ +Structured Data +--------------- + +Structured data refers to information that is highly organized and follows a predefined format. This format usually involves tables with rows and columns, where each column represents a specific attribute or field, and each row represents a unique record or instance. Structured data is typically found in SQL databases. + +Examples: Customer information in a CRM system, sales transactions in a financial database. + +Unstructured Data +----------------- + +Unstructured data lacks a specific structure or organized format. It doesn't fit neatly into tables with rows and columns. Instead, unstructured data can take the form of text, images, audio, video, social media posts, emails, and more. Unstructured data is diverse and can vary greatly in content, making it challenging to process and analyze without AI models. + +Examples: Social media posts, email content, images on the web. \ No newline at end of file diff --git a/docs/source/overview/connect-to-database.rst b/docs/source/overview/connect-to-database.rst new file mode 100644 index 0000000000..e74de58ca3 --- /dev/null +++ b/docs/source/overview/connect-to-database.rst @@ -0,0 +1,87 @@ +Connect to Database +============================ + +EvaDB supports an extensive range of data sources for structured and unstructured data. + +Connect to a SQL Database System +-------------------------------- + +1. Use the `CREATE DATABASE` statement to connect to an existing SQL database. + +.. code-block:: python + + cursor.query(""" + CREATE DATABASE restaurant_reviews + WITH ENGINE = 'postgres', + PARAMETERS = { + "user": "eva", + "password": "password", + "host": "localhost", + "port": "5432", + "database": "restaurant_reviews" + };""").df() + +.. note:: + + Go over the :ref:`CREATE DATABASE` statement for more details. The :ref:`Databases` page lists all the database systems that EvaDB currently supports. + +2. 
Preview the Available Data Using `SELECT` + +You can now preview the available data in the `restaurant_reviews` database with a standard :ref:`SELECT` statement. + +.. code-block:: python + + cursor.query(""" + SELECT * + FROM restaurant_reviews.food_review; + """).df() + +3. Run Native Queries in the Connected Database With `USE` + +You can also run native queries directly in the connected database system by the :ref:`USE` statement. + +.. code-block:: python + + cursor.query( + """ + USE restaurant_reviews { + INSERT INTO food_review (name, review) + VALUES ( + 'Customer 1', + 'I ordered fried rice but it is too salty.' + ) + }; + """).df() + + +Load Unstructured Data +----------------------- + +EvaDB supports diverse types of unstructured data. Here are some examples: + +1. Load Images from Local Filesystem + +You can load a collection of images obtained from Reddit from the local filesystem into EvaDB using the :ref:`LOAD` statement. + +.. code-block:: python + + cursor.query(""" + LOAD IMAGE 'reddit-images/*.jpg' + INTO reddit_dataset; + """).df() + +2. Load Video from Cloud Bucket + +You can load a video from an S3 cloud bucket into EvaDB using the :ref:`LOAD` statement. + +.. code-block:: python + + cursor.query(""" + LOAD VIDEO 's3://bucket/eva_videos/mnist.mp4' + INTO MNISTVid; + """).df() + +.. note:: + + Go over the :ref:`LOAD statement` statement for more details on the types of unstructured data that EvaDB supports. 
+ diff --git a/docs/source/overview/docker.rst b/docs/source/overview/docker.rst index 4e71d23d7d..cdbf5019bf 100644 --- a/docs/source/overview/docker.rst +++ b/docs/source/overview/docker.rst @@ -1,5 +1,7 @@ +:orphan: + Docker -==== +====== You can launch the EvaDB server using Docker either locally or on a server with GPUs: diff --git a/docs/source/overview/faq.rst b/docs/source/overview/faq.rst index c3468aac53..28b1e5a70b 100644 --- a/docs/source/overview/faq.rst +++ b/docs/source/overview/faq.rst @@ -1,3 +1,5 @@ +:orphan: + === FAQ === @@ -8,7 +10,7 @@ If you still have questions after reading this FAQ, ping us on `our Slack `__! Why am I not able to install EvaDB in my Python environment? -====================================== +============================================================ Ensure that the local Python version is >= 3.8 and <= 3.10. EvaDB cannot support 3.11 due to its `dependency on Ray `__. @@ -34,7 +36,7 @@ You can either kill that process or launch EvaDB server on another free port in evadb_server -p 9330 Why do I see no output from the server? -==================================== +======================================= If a query runs a complex vision task (such as object detection) on a long video, the query is expected to take a non-trivial amount of time to finish. You can check the status of the server by running ``top`` or ``pgrep``: @@ -48,4 +50,4 @@ pip install ray fails because of grpcio ======================================= Follow these instructions to install ``ray``: -https://github.com/ray-project/ray/issues/33039 \ No newline at end of file +https://github.com/ray-project/ray/issues/33039 diff --git a/docs/source/overview/getting-started.rst b/docs/source/overview/getting-started.rst index bb6b944000..87480e3100 100644 --- a/docs/source/overview/getting-started.rst +++ b/docs/source/overview/getting-started.rst @@ -1,55 +1,115 @@ +.. 
_getting-started: Getting Started ================= -Install EvaDB ------------------------ +Install EvaDB +------------- -EvaDB supports Python (versions >= `3.8`). To install EvaDB, we recommend using the `pip` package manager: +To install EvaDB, we recommend using the `pip` package manager. + +1. Create a new `virtual environment `_ called `evadb-venv`. .. code-block:: bash - pip install evadb + python -m venv evadb-venv -.. note:: +Now, activate the virtual environment: + +.. code-block:: bash + + source evadb-venv/bin/activate + +2. Once inside the virtual environment, run the command below to mitigate the dependency issues. + +.. code-block:: bash + + pip install --upgrade pip setuptools wheel + +3. Install EvaDB + +.. code-block:: bash + + pip install evadb + +4. Verify EvaDB installation + +.. code-block:: bash - EvaDB provides multiple installation options for extending its functionalities. - Please see :doc:`Installation Guide ` for all options. + pip freeze -Write Your AI App --------------------------- +You should see a list of installed packages including but not limited to the following: -Here is an `illustrative MNIST digit classification app `_ using EvaDB. +.. code-block:: bash + + Package Version + ----------------- ------- + aenum 3.1.15 + decorator 5.1.1 + diskcache 5.6.3 + evadb 0.3.3 + greenlet 2.0.2 + lark 1.1.7 + numpy 1.25.2 + pandas 2.1.0 + ... + +5. Run EvaDB + +Copy the following Python program to a file called `run_evadb.py`. + +The program runs a SQL query for listing all the built-in functions in EvaDB. It consists of importing and connecting to EvaDB, and then running the query. The query's result is returned as a Dataframe. .. 
code-block:: python - # Connect to EvaDB for running AI queries - import evadb - cursor = evadb.connect().cursor() + # Import the EvaDB package + import evadb + + # Connect to EvaDB and get a database cursor for running queries + cursor = evadb.connect().cursor() + + # List all the built-in functions in EvaDB + print(cursor.query("SHOW FUNCTIONS;").df()) + +Now, run the Python program: - # Load the MNIST video into EvaDB - cursor.load("mnist.mp4", "MNISTVid", format="video").df() +.. code-block:: bash + + python -m run_evadb.py + +You should see a list of built-in functions including but not limited to the following: + +.. code-block:: bash + + name inputs ... impl metadata + 0 ArrayCount [Input_Array NDARRAY ANYTYPE (), Search_Key ANY] ... /home/jarulraj3/evadb/evadb/functions/ndarray/array... [] + 1 Crop [Frame_Array NDARRAY UINT8 (3, None, None), bb... ... /home/jarulraj3/evadb/evadb/functions/ndarray/crop.py [] + 2 ChatGPT [query NDARRAY STR (1,), content NDARRAY STR (... ... /home/jarulraj3/evadb/evadb/functions/chatgpt.py [] - # We now construct an AI pipeline to run the image classifier - # over all the digit images in the video - # Each frame in the loaded MNIST video contains a digit + [3 rows x 6 columns] - # Connect to the table with the loaded video - query = cursor.table("MNISTVid") +.. note:: + Go over the :ref:`Python API` to learn more about `connect()` and `cursor`. - # Run the model on a subset of frames - # Here, id refers to the frame id - query = query.filter("id = 30 OR id = 50 OR id = 70") +.. note:: - # We are retrieving the frame "data" and - # the output of the Image Classification function on the data - query = query.select("data, MnistImageClassifier(data).label") + EvaDB supports additional installation options for extending its functionality. Go over the :doc:`Installation Options ` for all the available options. 
- # EvaDB uses a lazy query construction technique to improve performance - # Only calling query.df() will run the query - response = query.df() +Illustrative AI Query +--------------------- +Here is an illustrative `MNIST image classification `_ AI query in EvaDB. -The notebook works on `Google Colab `_. +.. code-block:: sql + + --- This AI query retrieves images in the loaded MNIST video with label 4 + --- We constrain the query to only search through the first 100 frames + --- We limit the query to only return the first five frames with label 4 + SELECT data, id, MnistImageClassifier(data) + FROM MnistVideo + WHERE MnistImageClassifier(data) = '4' AND id < 100 + LIMIT 5; -.. image:: ../../images/reference/mnist.png \ No newline at end of file +The complete `MNIST notebook is available on Colab `_. +Try out EvaDB by experimenting with this introductory notebook. diff --git a/docs/source/overview/getting-started/install-guide.rst b/docs/source/overview/getting-started/install-guide.rst deleted file mode 100644 index c054e8fd2c..0000000000 --- a/docs/source/overview/getting-started/install-guide.rst +++ /dev/null @@ -1,59 +0,0 @@ -Installation Guide -================== - -EvaDB provides couple different installation options to allow easy extension to rich functionalities. - -Default -------- - -By Default, EvaDB installs only the minimal requirements. - -.. code-block:: - - pip install evadb - -Vision Capability ------------------ - -You can install EvaDB with the vision extension. -With vision extension, you can run queries to do image classification, object detection, and emotion analysis workloads, etc. - -.. code-block:: - - pip install evadb[vision] - -Documents Summarization with LLM --------------------------------- - -You can also use EvaDB to leverage the capability of LLM to summarize or do question answering for documents. - -.. 
code-block:: - - pip install evadb[document] - -Additional Vector Index ------------------------ - -EvaDB installs ``faiss`` vector index by default, but users can also install other index library such as ``qdrant`` for similarity search feature. - -.. code-block:: - - pip install evadb[qdrant] - -Training or Finetuning Model ----------------------------- - -Instead of using existing models for only inference, you can also train a customized function inside EvaDB with the ``ludwig`` extension. - -.. code-block:: - - pip install evadb[ludwig] - -Better Performance and Scalability ----------------------------------- - -EvaDB also allows users to improve the query performance by using ``ray`` to parallelize queries. - -.. code-block:: - - pip install evadb[ray] \ No newline at end of file diff --git a/docs/source/overview/getting-started/installation-options.rst b/docs/source/overview/getting-started/installation-options.rst new file mode 100644 index 0000000000..7ab700be3a --- /dev/null +++ b/docs/source/overview/getting-started/installation-options.rst @@ -0,0 +1,17 @@ +.. _installation options: + +Installation Options +==================== + +EvaDB provides the following additional installation options for extending its functionality. + +* ``pip install evadb[vision]`` for installing computer vision packages. They enable use-cases including image classification, object detection, and emotion analysis. + +* ``pip install evadb[document]`` for installing natural language processing packages. They enable use-cases including text summarization, question answering, and sentiment analysis. + +* ``pip install evadb[qdrant]`` for installing the Qdrant vector database system. It enables use-cases related to similarity search based on feature vectors. + +* ``pip install evadb[ludwig]`` for installing the Ludwig model training framework. It enables use-cases related to training and fine-tuning AI models. + +* ``pip install evadb[ray]`` for installing the Ray compute engine. 
It enables EvaDB to do more efficient query execution on CPUs and GPUs. + diff --git a/docs/source/reference/udfs/custom.rst b/docs/source/reference/ai/custom.rst similarity index 55% rename from docs/source/reference/udfs/custom.rst rename to docs/source/reference/ai/custom.rst index ef8d1ab707..d57a0fe059 100644 --- a/docs/source/reference/udfs/custom.rst +++ b/docs/source/reference/ai/custom.rst @@ -1,58 +1,61 @@ -User-Defined Functions +.. _udf: + + +Functions ====================== -This section provides an overview of how you can create and use a custom user-defined function (UDF) in your queries. For example, you could write an UDF that wraps around your custom PyTorch model. +This section provides an overview of how you can create and use a custom function in your queries. For example, you could write an function that wraps around your custom PyTorch model. -Part 1: Writing a custom UDF ------------------------------- +Part 1: Writing a custom Function +--------------------------------- -During each step, use `this UDF implementation `_ as a reference. +During each step, use `this function implementation `_ as a reference. -1. Create a new file under `udfs/` folder and give it a descriptive name. eg: `yolo_object_detection.py`. +1. Create a new file under `functions/` folder and give it a descriptive name. eg: `yolo_object_detection.py`. .. note:: - UDFs packaged along with EvaDB are located inside the `udfs `_ folder. + Functions packaged along with EvaDB are located inside the `functions `_ folder. -2. Create a Python class that inherits from `PytorchClassifierAbstractUDF`. +2. Create a Python class that inherits from `PytorchClassifierAbstractFunction`. -* The `PytorchClassifierAbstractUDF` is a parent class that defines and implements standard methods for model inference. +* The `PytorchClassifierAbstractFunction` is a parent class that defines and implements standard methods for model inference. 
* The functions setup and forward should be implemented in your child class. These functions can be implemented with the help of Decorators. Setup ----- -An abstract method that must be implemented in your child class. The setup function can be used to initialize the parameters for executing the UDF. The parameters that need to be set are +An abstract method that must be implemented in your child class. The setup function can be used to initialize the parameters for executing the function. The parameters that need to be set are - cacheable: bool - - True: Cache should be enabled. Cache will be automatically invalidated when the UDF changes. + - True: Cache should be enabled. Cache will be automatically invalidated when the function changes. - False: cache should not be enabled. -- udf_type: str +- function_type: str - - object_detection: UDFs for object detection. + - object_detection: functions for object detection. - batchable: bool - True: Batching should be enabled - False: Batching is disabled. -The custom setup operations for the UDF can be written inside the function in the child class. If there is no need for any custom logic, then you can just simply write "pass" in the function definition. +The custom setup operations for the function can be written inside the function in the child class. If there is no need for any custom logic, then you can just simply write "pass" in the function definition. -Example of a Setup function +Example of a Setup Function .. code-block:: python - @setup(cacheable=True, udf_type="object_detection", batchable=True) + @setup(cacheable=True, function_type="object_detection", batchable=True) def setup(self, threshold=0.85): - #custom setup function that is specific for the UDF + #custom setup function that is specific for the function self.threshold = threshold self.model = torch.hub.load("ultralytics/yolov5", "yolov5s", verbose=False) Forward -------- -An abstract method that must be implemented in your UDF. 
The forward function receives the frames and runs the deep learning model on the data. The logic for transforming the frames and running the models must be provided by you. +An abstract method that must be implemented in your function. The forward function receives the frames and runs the deep learning model on the data. The logic for transforming the frames and running the models must be provided by you. The arguments that need to be passed are - input_signatures: List[IOColumnArgument] @@ -89,7 +92,7 @@ A sample forward function is given below ], ) def forward(self, frames: Tensor) -> pd.DataFrame: - #the custom logic for the UDF + #the custom logic for the function outcome = [] frames = torch.permute(frames, (0, 2, 3, 1)) @@ -111,39 +114,39 @@ A sample forward function is given below ---------- -Part 2: Registering and using the UDF in EvaDB Queries ------------------------------------------------------- +Part 2: Registering and using the function in EvaDB Queries +----------------------------------------------------------- -Now that you have implemented your UDF, we need to register it as a UDF in EvaDB. You can then use the UDF in any query. +Now that you have implemented your function, we need to register it as a function in EvaDB. You can then use the function in any query. -1. Register the UDF with a query that follows this template: +1. Register the function with a query that follows this template: - `CREATE UDF [ IF NOT EXISTS ] + `CREATE FUNCTION [ IF NOT EXISTS ] IMPL ;` where, - * - specifies the unique identifier for the UDF. - * - specifies the path to the implementation class for the UDF + * - specifies the unique identifier for the function. + * - specifies the path to the implementation class for the function - Here, is an example query that registers a UDF that wraps around the 'YoloObjectDetection' model that performs Object Detection. 
+ Here, is an example query that registers a function that wraps around the 'YoloObjectDetection' model that performs Object Detection. .. code-block:: sql - CREATE UDF YoloDecorators - IMPL 'evadb/udfs/decorators/yolo_object_detection_decorators.py'; + CREATE FUNCTION YoloDecorators + IMPL 'evadb/functions/decorators/yolo_object_detection_decorators.py'; - A status of 0 in the response denotes the successful registration of this UDF. + A status of 0 in the response denotes the successful registration of this function. -2. Now you can execute your UDF on any video: +2. Now you can execute your function on any video: .. code-block:: sql SELECT YoloDecorators(data) FROM MyVideo WHERE id < 5; -3. You can drop the UDF when you no longer need it. +3. You can drop the function when you no longer need it. .. code-block:: sql - DROP UDF IF EXISTS YoloDecorators; \ No newline at end of file + DROP FUNCTION IF EXISTS YoloDecorators; diff --git a/docs/source/reference/udfs/hf.rst b/docs/source/reference/ai/hf.rst similarity index 62% rename from docs/source/reference/udfs/hf.rst rename to docs/source/reference/ai/hf.rst index 2af377791a..8a08313311 100644 --- a/docs/source/reference/udfs/hf.rst +++ b/docs/source/reference/ai/hf.rst @@ -1,25 +1,27 @@ +.. _hf: + HuggingFace Models ====================== This section provides an overview of how you can use out-of-the-box HuggingFace models in EvaDB. -Creating UDF from HuggingFace ------------------------------- +Creating Function from HuggingFace +---------------------------------- -EvaDB supports UDFS similar to `Pipelines `_ in HuggingFace. +EvaDB supports functions similar to `Pipelines `_ in HuggingFace. .. code-block:: sql - CREATE UDF IF NOT EXISTS HFObjectDetector + CREATE FUNCTION IF NOT EXISTS HFObjectDetector TYPE HuggingFace - 'task' 'object-detection' - 'model' 'facebook / detr-resnet-50' + TASK 'object-detection' + MODEL 'facebook / detr-resnet-50' EvaDB supports all arguments supported by HF pipelines. 
You can pass those using a key value format similar to task and model above. Supported Tasks ------ +---------------- EvaDB supports the following tasks from huggingface: - Audio Classification @@ -32,4 +34,4 @@ EvaDB supports the following tasks from huggingface: - Image Segmentation - Image-to-Text - Object Detection -- Depth Estimation \ No newline at end of file +- Depth Estimation diff --git a/docs/source/reference/udfs/index.rst b/docs/source/reference/ai/index.rst similarity index 68% rename from docs/source/reference/udfs/index.rst rename to docs/source/reference/ai/index.rst index 5f109d96e4..a25fd58815 100644 --- a/docs/source/reference/udfs/index.rst +++ b/docs/source/reference/ai/index.rst @@ -1,7 +1,7 @@ Models ------------------------------------------ -EvaDB facilitates the utilization of thin wrappers around deep learning, commonly referred to as User Defined Functions (UDFs). These UDFs enable the incorporation of deep learning models into AI queries. +EvaDB facilitates the utilization of thin wrappers around deep learning, commonly referred to as functions. These functions enable the incorporation of deep learning models into AI queries. This section compiles a comprehensive catalog of the model integrations that EvaDB supports. diff --git a/docs/source/reference/ai/model-forecasting.rst b/docs/source/reference/ai/model-forecasting.rst new file mode 100644 index 0000000000..75602a0352 --- /dev/null +++ b/docs/source/reference/ai/model-forecasting.rst @@ -0,0 +1,37 @@ +Time Series Forecasting +======================== + +You can train a forecasting model easily in EvaDB. + +.. note:: + + Install `statsforecast` in your EvaDB virtual environment: ``pip install eva[forecasting]``. + +First, we create a table to insert required data. + +.. code-block:: sql + + CREATE TABLE AirData ( + unique_id TEXT(30), + ds TEXT(30), + y INTEGER); + + LOAD CSV 'data/forecasting/air-passengers.csv' INTO AirData; + + +Next, we create a UDF of `TYPE Forecasting`. 
We must enter the column name on which we wish to forecast using `predict`. Other options include `id` and `time` (they represent the unique id of the items and the time data if available). + +.. code-block:: sql + + CREATE UDF IF NOT EXISTS Forecast FROM + (SELECT y FROM AirData) + TYPE Forecasting + PREDICT 'y'; + +This trains a forecasting model. The model can be called by providing the horizon for forecasting. + +.. code-block:: sql + + SELECT Forecast(12) FROM AirData; + +Here, the horizon is `12`. diff --git a/docs/source/reference/udfs/model-train.rst b/docs/source/reference/ai/model-train.rst similarity index 54% rename from docs/source/reference/udfs/model-train.rst rename to docs/source/reference/ai/model-train.rst index f404f81031..c4b10bcbed 100644 --- a/docs/source/reference/udfs/model-train.rst +++ b/docs/source/reference/ai/model-train.rst @@ -1,5 +1,5 @@ -Model Train and Finetune -==== +Training and Finetuning +======================== 1. You can train a predication model easily in EvaDB @@ -9,40 +9,40 @@ Model Train and Finetune .. code-block:: sql - CREATE UDF IF NOT EXISTS PredictHouseRent FROM + CREATE FUNCTION IF NOT EXISTS PredictHouseRent FROM ( SELECT sqft, location, rental_price FROM HomeRentals ) TYPE Ludwig - 'predict' 'rental_price' - 'time_limit' 120; + PREDICT 'rental_price' + TIME_LIMIT 120; -In the above query, you are creating a new customized UDF by automatically training a model from the `HomeRentals` table. The `rental_price` column will be the target column for predication, while `sqft` and `location` are the inputs. +In the above query, you are creating a new customized function by automatically training a model from the `HomeRentals` table. The `rental_price` column will be the target column for predication, while `sqft` and `location` are the inputs. You can also simply give all other columns in `HomeRentals` as inputs and let the underlying automl framework to figure it out. Below is an example query: .. 
code-block:: sql - CREATE UDF IF NOT EXISTS PredictHouseRent FROM + CREATE FUNCTION IF NOT EXISTS PredictHouseRent FROM ( SELECT * FROM HomeRentals ) TYPE Ludwig - 'predict' 'rental_price' - 'time_limit' 120; + PREDICT 'rental_price' + TIME_LIMIT 120; .. note:: Check :ref:`create-udf-train` for available configurations for training models. -2. After training completes, you can use the `PredictHouseRent` like all other UDFs in EvaDB +2. After training completes, you can use the `PredictHouseRent` like all other functions in EvaDB .. code-block:: sql CREATE PredictHouseRent(sqft, location) FROM HomeRentals; -You can also simply give all columns in `HomeRentals` as inputs for inference. The customized UDF with the underlying model can figure out the proper inference columns via the training columns. +You can also simply give all columns in `HomeRentals` as inputs for inference. The customized function with the underlying model can figure out the proper inference columns via the training columns. .. code-block:: sql CREATE PredictHouseRent(*) FROM HomeRentals; -Check out our `Integration Tests `_ for working example. +Check out our `Integration Tests `_ for working example. diff --git a/docs/source/reference/ai/openai.rst b/docs/source/reference/ai/openai.rst new file mode 100644 index 0000000000..d8f24c3b95 --- /dev/null +++ b/docs/source/reference/ai/openai.rst @@ -0,0 +1,29 @@ +.. _openai: + +OpenAI Models +===================== + +This section provides an overview of how you can use OpenAI models in EvaDB. + + +Chat Completion Functions +------------------------- + +To create a chat completion function in EvaDB, use the following SQL command: + +.. 
code-block:: sql + + CREATE FUNCTION IF NOT EXISTS OpenAIChatCompletion + IMPL 'evadb/functions/openai_chat_completion_function.py' + MODEL 'gpt-3.5-turbo' + +EvaDB supports the following models for chat completion task: + +- "gpt-4" +- "gpt-4-0314" +- "gpt-4-32k" +- "gpt-4-32k-0314" +- "gpt-3.5-turbo" +- "gpt-3.5-turbo-0301" + +The chat completion function can be composed in interesting ways with other functions. Please check the `Google Colab `_ for an example of combining chat completion task with caption extraction and video summarization models from Hugging Face and feeding it to chat completion to ask questions about the results. diff --git a/docs/source/reference/udfs/yolo.rst b/docs/source/reference/ai/yolo.rst similarity index 65% rename from docs/source/reference/udfs/yolo.rst rename to docs/source/reference/ai/yolo.rst index bc083586e3..6149a4d22b 100644 --- a/docs/source/reference/udfs/yolo.rst +++ b/docs/source/reference/ai/yolo.rst @@ -7,13 +7,13 @@ This section provides an overview of how you can use out-of-the-box Ultralytics Creating YOLO Model ------------------- -To create a YOLO UDF in EvaDB using Ultralytics models, use the following SQL command: +To create a YOLO function in EvaDB using Ultralytics models, use the following SQL command: .. code-block:: sql - CREATE UDF IF NOT EXISTS Yolo + CREATE FUNCTION IF NOT EXISTS Yolo TYPE ultralytics - 'model' 'yolov8m.pt' + MODEL 'yolov8m.pt' You can change the `model` value to specify any other model supported by Ultralytics. @@ -30,11 +30,11 @@ The following models are currently supported by Ultralytics in EvaDB: Please refer to the `Ultralytics documentation `_ for more information about these models and their capabilities. 
-Using Ultralytics Models with Other UDFs ----------------------------------------- +Using Ultralytics Models with Other Functions +--------------------------------------------- This code block demonstrates how the YOLO model can be combined with other models such as Color and DogBreedClassifier to perform more specific and targeted object detection tasks. In this case, the goal is to find images of black-colored Great Danes. -The first query uses YOLO to detect all images of dogs with black color. The ``UNNEST`` function is used to split the output of the ``Yolo`` UDF into individual rows, one for each object detected in the image. The ``Color`` UDF is then applied to the cropped portion of the image to identify the color of each detected dog object. The ``WHERE`` clause filters the results to only include objects labeled as "dog" and with a color of "black". +The first query uses YOLO to detect all images of dogs with black color. The ``UNNEST`` function is used to split the output of the ``Yolo`` function into individual rows, one for each object detected in the image. The ``Color`` function is then applied to the cropped portion of the image to identify the color of each detected dog object. The ``WHERE`` clause filters the results to only include objects labeled as "dog" and with a color of "black". .. code-block:: sql @@ -44,7 +44,7 @@ The first query uses YOLO to detect all images of dogs with black color. The ``U AND Color(Crop(data, bbox)) = 'black'; -The second query builds upon the first by further filtering the results to only include images of Great Danes. The ``DogBreedClassifier`` UDF is used to classify the cropped portion of the image as a Great Dane. The ``WHERE`` clause adds an additional condition to filter the results to only include objects labeled as "dog", with a color of "black", and classified as a "great dane". +The second query builds upon the first by further filtering the results to only include images of Great Danes. 
The ``DogBreedClassifier`` function is used to classify the cropped portion of the image as a Great Dane. The ``WHERE`` clause adds an additional condition to filter the results to only include objects labeled as "dog", with a color of "black", and classified as a "great dane". .. code-block:: sql diff --git a/docs/source/reference/api.rst b/docs/source/reference/api.rst index 77caf9484a..3fcd416771 100644 --- a/docs/source/reference/api.rst +++ b/docs/source/reference/api.rst @@ -1,77 +1,33 @@ -Basic API -========== +.. _python-api: -To begin your querying session, get a connection with a cursor to EvaDB using ``connect`` and ``cursor`` function calls: +Python API +========== -.. autosummary:: - :toctree: ./doc - - ~evadb.connect - ~evadb.EvaDBConnection.cursor +To begin a querying session in EvaDB, obtain a connection with a cursor using ``connect`` and ``cursor`` functions. After getting the cursor, you can run queries with the ``query`` function in this manner: .. code-block:: python + # Import the EvaDB package import evadb + # Connect to EvaDB and get a database cursor for running queries cursor = evadb.connect().cursor() -After getting a cursor, you can load documents and run queries using the ``EvaDBCursor`` interface. To construct the queries with pandas-like API, -use the ``EvaDBQuery`` interface. - -.. code-block:: python - - ### load the pdfs in a given folder into the "office_data" table - cursor.load( - file_regex=f"office_data/*.pdf", format="PDF", table_name="office_data_table" - ).df() - - ### load a given video into the "youtube_videos" table - cursor.load("movie.mp4", "youtube_videos", "video").df() - -.. warning:: + # List all the built-in functions in EvaDB + print(cursor.query("SHOW UDFS;").df()) - It is important to call ``df`` to run the actual query and get the result dataframe. EvaDB does lazy query execution to improve performance. - Calling ``cursor.query("...")`` will only construct and not run the query. 
Calling ``cursor.query("...").df()`` will both construct and run the query. - -EvaDBCursor Interface ---------------------- - -Using the cursor, you can refer to a table, load documents, create functions, create vector index, and several other tasks. - -After connecting to a table using ``table``, you can construct a complex query using the ``EvaDBQuery`` interface. - -.. autosummary:: +.. autosummary:: :toctree: ./doc - - ~evadb.EvaDBCursor.table - ~evadb.EvaDBCursor.load + + ~evadb.connect + ~evadb.EvaDBConnection.cursor ~evadb.EvaDBCursor.query - ~evadb.EvaDBCursor.create_function - ~evadb.EvaDBCursor.create_table - ~evadb.EvaDBCursor.create_vector_index - ~evadb.EvaDBCursor.drop_table - ~evadb.EvaDBCursor.drop_function - ~evadb.EvaDBCursor.drop_index ~evadb.EvaDBCursor.df - ~evadb.EvaDBCursor.show - ~evadb.EvaDBCursor.insert - ~evadb.EvaDBCursor.explain - ~evadb.EvaDBCursor.rename -EvaDBQuery Interface ---------------------- +.. warning:: -.. autosummary:: - :toctree: ./doc + It is important to call ``df`` to run the actual query and get the output dataframe. + + ``cursor.query("...")`` only construct the query and not run the query. ``cursor.query("...").df()`` will both construct and run the query. - ~evadb.EvaDBQuery.select - ~evadb.EvaDBQuery.cross_apply - ~evadb.EvaDBQuery.filter - ~evadb.EvaDBQuery.df - ~evadb.EvaDBQuery.alias - ~evadb.EvaDBQuery.limit - ~evadb.EvaDBQuery.order - ~evadb.EvaDBQuery.show - ~evadb.EvaDBQuery.sql_query - ~evadb.EvaDBQuery.execute \ No newline at end of file diff --git a/docs/source/reference/databases/index.rst b/docs/source/reference/databases/index.rst new file mode 100644 index 0000000000..6686d5b998 --- /dev/null +++ b/docs/source/reference/databases/index.rst @@ -0,0 +1,9 @@ +.. _databases: + +Databases +============= + +Below are all supported data sources for EvaDB. We welcome adding new data source integrations in EvaDB. Check :ref:`add-data-source` for guidance. + + +.. 
tableofcontents:: diff --git a/docs/source/reference/databases/mysql.rst b/docs/source/reference/databases/mysql.rst new file mode 100644 index 0000000000..0a1c67d23f --- /dev/null +++ b/docs/source/reference/databases/mysql.rst @@ -0,0 +1,36 @@ +MySQL +========== + +The connection to MySQL is based on the `mysql-connector-python `_ library. + +Dependency +---------- + +* mysql-connector-python + + +Parameters +---------- + +Required: + +* `user` is the database user. +* `password` is the database password. +* `host` is the host name, IP address, or URL. +* `port` is the port used to make TCP/IP connection. +* `database` is the database name. + + +Create Connection +----------------- + +.. code-block:: text + + CREATE DATABASE mysql_data WITH ENGINE = 'mysql', PARAMETERS = { + "user": "eva", + "password": "password", + "host": "localhost", + "port": "3306", + "database": "evadb" + }; + diff --git a/docs/source/reference/databases/postgres.rst b/docs/source/reference/databases/postgres.rst new file mode 100644 index 0000000000..679cada5e0 --- /dev/null +++ b/docs/source/reference/databases/postgres.rst @@ -0,0 +1,36 @@ +PostgreSQL +========== + +The connection to PostgreSQL is based on the `psycopg2 `_ library. + +Dependency +---------- + +* psycopg2 + + +Parameters +---------- + +Required: + +* `user` is the database user. +* `password` is the database password. +* `host` is the host name, IP address, or URL. +* `port` is the port used to make TCP/IP connection. +* `database` is the database name. + + +Create Connection +----------------- + +..
code-block:: text + + CREATE DATABASE postgres_data WITH ENGINE = 'postgres', PARAMETERS = { + "user": "eva", + "password": "password", + "host": "localhost", + "port": "5432", + "database": "evadb" + }; + diff --git a/docs/source/reference/databases/sqlite.rst b/docs/source/reference/databases/sqlite.rst new file mode 100644 index 0000000000..77086ce013 --- /dev/null +++ b/docs/source/reference/databases/sqlite.rst @@ -0,0 +1,27 @@ +SQLite +========== + +The connection to SQLite is based on the `sqlite3 `_ library. + +Dependency +---------- + +* sqlite3 + + +Parameters +---------- + +Required: + +* `database` is the path to the database file to be opened. You can pass ":memory:" to create an SQLite database existing only in memory, and open a connection to it. + + +Create Connection +----------------- + +.. code-block:: text + + CREATE DATABASE sqlite_data WITH ENGINE = 'sqlite', PARAMETERS = { + "database": "evadb.db" + }; diff --git a/docs/source/reference/evaql.rst b/docs/source/reference/evaql.rst index c2d84305ee..c00f87b7cf 100644 --- a/docs/source/reference/evaql.rst +++ b/docs/source/reference/evaql.rst @@ -1,20 +1,20 @@ EvaDB Query Language Reference -============================ +=============================== EvaDB Query Language (EvaDB) is derived from SQL. It is tailored for AI-driven analytics. EvaDB allows users to invoke deep learning models in the form -of user-defined functions (UDFs). +of functions. -Here is an example where we first define a UDF wrapping around the FastRCNN object detection model. We then issue a query with this function to detect objects. +Here is an example where we first define a function wrapping around the FastRCNN object detection model. We then issue a query with this function to detect objects. .. 
code:: sql --- Create an user-defined function wrapping around FastRCNN ObjectDetector - CREATE UDF IF NOT EXISTS FastRCNNObjectDetector + CREATE FUNCTION IF NOT EXISTS FastRCNNObjectDetector INPUT (frame NDARRAY UINT8(3, ANYDIM, ANYDIM)) OUTPUT (labels NDARRAY STR(ANYDIM), bboxes NDARRAY FLOAT32(ANYDIM, 4), scores NDARRAY FLOAT32(ANYDIM)) TYPE Classification - IMPL 'evadb/udfs/fastrcnn_object_detector.py'; + IMPL 'evadb/functions/fastrcnn_object_detector.py'; --- Use the function to retrieve frames that contain more than 3 cars SELECT id FROM MyVideo diff --git a/docs/source/reference/evaql/create.rst b/docs/source/reference/evaql/create.rst index 58ad4aa4a4..9023e6613f 100644 --- a/docs/source/reference/evaql/create.rst +++ b/docs/source/reference/evaql/create.rst @@ -1,8 +1,39 @@ CREATE ====== +.. _sql-create-database: + +CREATE DATABASE +--------------- + +The CREATE DATABASE statement allows us to connect to an external structured data store in EvaDB. + +.. code:: text + + CREATE DATABASE [database_connection] + WITH ENGINE = [database_engine], + PARAMETERS = [key_value_parameters]; + +* [database_connection] is the name of the database connection. `[database_connection].[table_name]` will be used as table name to compose SQL queries in EvaDB. +* [database_engine] is the supported database engine. Check :ref:`supported data sources` for all engine and their available configuration parameters. +* [key_value_parameters] is a list of key-value pairs as arguments to establish a connection. + + +Examples +~~~~~~~~ + +.. code:: text + + CREATE DATABASE postgres_data WITH ENGINE = 'postgres', PARAMETERS = { + "user": "eva", + "password": "password", + "host": "localhost", + "port": "5432", + "database": "evadb" + }; + CREATE TABLE ----- +------------ To create a table, specify the schema of the table. @@ -18,38 +49,38 @@ To create a table, specify the schema of the table. 
object_id INTEGER ); -CREATE UDF ----- +CREATE FUNCTION +--------------- -To register an user-defined function, specify the implementation details of the UDF. +To register an user-defined function, specify the implementation details of the function. .. code-block:: sql - CREATE UDF IF NOT EXISTS FastRCNNObjectDetector + CREATE FUNCTION IF NOT EXISTS FastRCNNObjectDetector INPUT (frame NDARRAY UINT8(3, ANYDIM, ANYDIM)) OUTPUT (labels NDARRAY STR(ANYDIM), bboxes NDARRAY FLOAT32(ANYDIM, 4), scores NDARRAY FLOAT32(ANYDIM)) TYPE Classification - IMPL 'evadb/udfs/fastrcnn_object_detector.py'; + IMPL 'evadb/functions/fastrcnn_object_detector.py'; .. _create-udf-train: -CREATE UDF via Training ----- +CREATE FUNCTION via Training +---------------------------- To register an user-defined function by training a predication model. .. code-block:: sql - CREATE UDF IF NOT EXISTS PredictHouseRent FROM + CREATE FUNCTION IF NOT EXISTS PredictHouseRent FROM (SELECT * FROM HomeRentals) TYPE Ludwig - 'predict' 'rental_price' - 'time_list' 120; - 'tune_for_memory' False; + PREDICT 'rental_price' + TIME_LIST 120; + TUNE_FOR_MEMORY False; CREATE MATERIALIZED VIEW ----- +------------------------ To create a view with materialized results -- like the outputs of deep learning model, use the following template: diff --git a/docs/source/reference/evaql/delete.rst b/docs/source/reference/evaql/delete.rst index b97d10869e..782935301f 100644 --- a/docs/source/reference/evaql/delete.rst +++ b/docs/source/reference/evaql/delete.rst @@ -2,10 +2,10 @@ DELETE ======= DELETE INTO TABLE ------ +----------------- Delete a tuple from a table based on a predicate. .. 
code:: mysql - DELETE FROM MyVideo WHERE id<10; \ No newline at end of file + DELETE FROM MyVideo WHERE id<10; diff --git a/docs/source/reference/evaql/drop.rst b/docs/source/reference/evaql/drop.rst index 6d9e04b05d..67a7234891 100644 --- a/docs/source/reference/evaql/drop.rst +++ b/docs/source/reference/evaql/drop.rst @@ -2,16 +2,16 @@ DROP ==== DROP TABLE ----- +---------- .. code:: mysql DROP TABLE DETRACVideo; -DROP UDF ----- +DROP FUNCTION +------------- .. code:: mysql - DROP UDF FastRCNNObjectDetector; + DROP FUNCTION FastRCNNObjectDetector; diff --git a/docs/source/reference/evaql/explain.rst b/docs/source/reference/evaql/explain.rst index 4971cb13cd..6a0b96bb1a 100644 --- a/docs/source/reference/evaql/explain.rst +++ b/docs/source/reference/evaql/explain.rst @@ -1,8 +1,8 @@ EXPLAIN -==== +======== EXPLAIN QUERY ----- +------------- List the query plan associated with a EvaDB query diff --git a/docs/source/reference/evaql/insert.rst b/docs/source/reference/evaql/insert.rst index af1e8bebdc..4d0914e8f2 100644 --- a/docs/source/reference/evaql/insert.rst +++ b/docs/source/reference/evaql/insert.rst @@ -6,20 +6,20 @@ TABLE MyVideo MyVideo Table schema -.. code:: mysql +.. code:: text CREATE TABLE MyVideo (id INTEGER, data NDARRAY FLOAT32(ANYDIM)); INSERT INTO TABLE ------ +----------------- Insert a tuple into a table. -.. code:: mysql +.. code:: text INSERT INTO MyVideo (id, data) VALUES (1, [[[40, 40, 40] , [40, 40, 40]], - [[40, 40, 40] , [40, 40, 40]]]); \ No newline at end of file + [[40, 40, 40] , [40, 40, 40]]]); diff --git a/docs/source/reference/evaql/load.rst b/docs/source/reference/evaql/load.rst index 5d5a84c0c5..2772ad7992 100644 --- a/docs/source/reference/evaql/load.rst +++ b/docs/source/reference/evaql/load.rst @@ -1,10 +1,12 @@ +.. _sql-load: + LOAD ==== .. _1-load-video-from-filesystem: LOAD VIDEO FROM FILESYSTEM ----- +-------------------------- .. 
code:: mysql @@ -20,7 +22,7 @@ When a video is loaded, there is no need to specify the schema for the video tab .. _2-load-video-from-s3: LOAD VIDEO FROM S3 ----- +------------------ .. code:: mysql @@ -32,7 +34,7 @@ The videos are downloaded to a directory that can be configured in the EvaDB con .. _3-load-image-from-file: LOAD IMAGE FROM FILESYSTEM ----- +-------------------------- .. code:: mysql @@ -44,7 +46,7 @@ the following schema with two columns: ``name`` and ``data``, that correspond to .. _4-load-the-csv-file: LOAD CSV ----- +-------- To **LOAD** a CSV file, we need to first specify the table schema. diff --git a/docs/source/reference/evaql/rename.rst b/docs/source/reference/evaql/rename.rst index 67dfc6736b..63875dd630 100644 --- a/docs/source/reference/evaql/rename.rst +++ b/docs/source/reference/evaql/rename.rst @@ -2,8 +2,8 @@ RENAME ====== RENAME TABLE ----- +------------ .. code:: sql - RENAME TABLE MyVideo TO MyVideo1; \ No newline at end of file + RENAME TABLE MyVideo TO MyVideo1; diff --git a/docs/source/reference/evaql/select.rst b/docs/source/reference/evaql/select.rst index 2c437cdfc0..81b3f01eaf 100644 --- a/docs/source/reference/evaql/select.rst +++ b/docs/source/reference/evaql/select.rst @@ -1,8 +1,10 @@ +.. _sql-select: + SELECT ====== SELECT FRAMES WITH PREDICATES ----- +----------------------------- Search for frames with a car @@ -29,8 +31,8 @@ Search for frames containing greater than 3 cars WHERE ArrayCount(FastRCNNObjectDetector(data).label, 'car') > 3 ORDER BY id; -SELECT WITH MULTIPLE UDFS ----- +SELECT WITH MULTIPLE FUNCTIONS +------------------------------ Compose multiple user-defined functions in a single query to construct semantically complex queries. 
diff --git a/docs/source/reference/evaql/show.rst b/docs/source/reference/evaql/show.rst index 2abcc1b776..41a46e17d0 100644 --- a/docs/source/reference/evaql/show.rst +++ b/docs/source/reference/evaql/show.rst @@ -1,11 +1,11 @@ SHOW ==== -SHOW UDFS ----- +SHOW FUNCTIONS +-------------- List the registered user-defined functions .. code:: sql - SHOW UDFS; + SHOW FUNCTIONS; diff --git a/docs/source/reference/evaql/udf.rst b/docs/source/reference/evaql/udf.rst index 9c2693b9c7..883c0e7e8b 100644 --- a/docs/source/reference/evaql/udf.rst +++ b/docs/source/reference/evaql/udf.rst @@ -1,33 +1,35 @@ -UDF -=== +:orphan: -SHOW UDFS ----- +Functions +========= + +SHOW FUNCTIONS +-------------- Here is a list of built-in user-defined functions in EvaDB. .. code:: mysql - SHOW UDFS; + SHOW FUNCTIONS; id name impl - 0 FastRCNNObjectDetector evadb/udfs/fastrcnn_object_detector.p - 1 MVITActionRecognition evadb/udfs/mvit_action_recognition.py - 2 ArrayCount evadb/udfs/ndarray/array_count.py - 3 Crop evadb/evadb/udfs/ndarray/crop.py + 0 FastRCNNObjectDetector evadb/functions/fastrcnn_object_detector.p + 1 MVITActionRecognition evadb/functions/mvit_action_recognition.py + 2 ArrayCount evadb/functions/ndarray/array_count.py + 3 Crop evadb/evadb/functions/ndarray/crop.py FastRCNNObjectDetector is a model for detecting objects. MVITActionRecognition is a model for recognizing actions. ArrayCount and Crop are utility functions for counting the number of objects in an array and cropping a bounding box from an image, respectively. -SELECT WITH MULTIPLE UDFS ----- +SELECT WITH MULTIPLE FUNCTIONS +------------------------------ -Here is a query that illustrates how to use multiple UDFs in a single query. +Here is a query that illustrates how to use multiple functions in a single query. .. 
code:: sql SELECT id, bbox, EmotionDetector(Crop(data, bbox)) FROM HAPPY JOIN LATERAL UNNEST(FaceDetector(data)) AS Face(bbox, conf) - WHERE id < 15; \ No newline at end of file + WHERE id < 15; diff --git a/docs/source/reference/evaql/use.rst b/docs/source/reference/evaql/use.rst new file mode 100644 index 0000000000..b65b68f5d3 --- /dev/null +++ b/docs/source/reference/evaql/use.rst @@ -0,0 +1,36 @@ +.. _sql-use: + +USE +=== + +The USE statement allows us to run arbitrary native queries in the connected database. + +.. code:: text + + USE [database_connection] { [native_query] }; + +* [database_connection] is an external database connection instanced by the `CREATE DATABASE statement`. +* [native_query] is an arbitrary SQL query supported by the [database_connection]. + +.. warning:: + + Currently EvaDB only supports single query in one USE statement. The [native_query] should not end with semicolon. + +Examples +-------- + +.. code:: text + + USE postgres_data { + DROP TABLE IF EXISTS food_review + }; + + USE postgres_data { + CREATE TABLE food_review (name VARCHAR(10), review VARCHAR(1000)) + }; + + USE postgres_data { + INSERT INTO food_review (name, review) VALUES ('Customer 1', 'I ordered fried rice but it is too salty.') + }; + + diff --git a/docs/source/reference/gpu.rst b/docs/source/reference/gpu.rst index 9381af9657..d5deebd605 100644 --- a/docs/source/reference/gpu.rst +++ b/docs/source/reference/gpu.rst @@ -1,5 +1,7 @@ +:orphan: + Configure GPU ------ +------------- 1. Queries in EvaDB use deep learning models that run much faster on a GPU as opposed to a CPU. If your workstation has a GPU, you can configure EvaDB to use the GPU during query execution. Use the following command to check your hardware capabilities: @@ -30,4 +32,4 @@ Output of `cuda:0` indicates the presence of a GPU. 0 indicates the index of the executor: gpus: {'127.0.1.1': [0]} -Here, `127.0.1.1` is the loopback address on which the EvaDB server is running. 
0 refers to the GPU index to be used. \ No newline at end of file +Here, `127.0.1.1` is the loopback address on which the EvaDB server is running. 0 refers to the GPU index to be used. diff --git a/docs/source/reference/io.rst b/docs/source/reference/io.rst index 7dd63df9b4..dd22c8b7ab 100644 --- a/docs/source/reference/io.rst +++ b/docs/source/reference/io.rst @@ -1,10 +1,12 @@ +:orphan: + IO Descriptors ====================== -EvaDB supports three key data types. The inputs and outputs of the user-defined functions (UDFs) must be of one of these types. +EvaDB supports three key data types. The inputs and outputs of the functions must be of one of these types. NumpyArray ------------ -Used when the inputs or outputs of the UDF is of type Numpy Array. +Used when the inputs or outputs of the function is of type Numpy Array. Parameters ------------ @@ -51,11 +53,11 @@ dimensions(*Tuple(int)*): shape of the numpy array PandasDataframe ---------------- -columns (*List[str]*): list of strings that represent the expected column names in the pandas dataframe that is returned from the UDF. +columns (*List[str]*): list of strings that represent the expected column names in the pandas dataframe that is returned from the function. -column_types (*NdArrayType*): expected datatype of the column in the pandas dataframe returned from the UDF. The NdArrayType class is inherited from evadb.catalog.catalog_type. +column_types (*NdArrayType*): expected datatype of the column in the pandas dataframe returned from the function. The NdArrayType class is inherited from evadb.catalog.catalog_type. -column_shapes (*List[tuples]*): list of tuples that represent the expected shapes of columns in the pandas dataframe returned from the UDF. +column_shapes (*List[tuples]*): list of tuples that represent the expected shapes of columns in the pandas dataframe returned from the function. .. 
code-block:: python diff --git a/docs/source/reference/udfs/openai.rst b/docs/source/reference/udfs/openai.rst deleted file mode 100644 index 5fea7cfcab..0000000000 --- a/docs/source/reference/udfs/openai.rst +++ /dev/null @@ -1,27 +0,0 @@ -OpenAI Models -===================== - -This section provides an overview of how you can use OpenAI models in EvaDB. - - -Chat Completion UDFs --------------------- - -To create a chat completion UDF in EvaDB, use the following SQL command: - -.. code-block:: sql - - CREATE UDF IF NOT EXISTS OpenAIChatCompletion - IMPL 'evadb/udfs/openai_chat_completion_udf.py' - 'model' 'gpt-3.5-turbo' - -EvaDB supports the following models for chat completion task: - -- "gpt-4" -- "gpt-4-0314" -- "gpt-4-32k" -- "gpt-4-32k-0314" -- "gpt-3.5-turbo" -- "gpt-3.5-turbo-0301" - -The chat completion UDF can be composed in interesting ways with other UDFs. Please refer to the `ChatGPT notebook `_ for an example of combining chat completion task with caption extraction and video summarization models from Hugging Face and feeding it to chat completion to ask questions about the results. diff --git a/docs/source/setup.rst b/docs/source/setup.rst index 76f5c9ced3..db4b126cf1 100644 --- a/docs/source/setup.rst +++ b/docs/source/setup.rst @@ -1,3 +1,5 @@ +:orphan: + .. _guide-setup: Setup diff --git a/docs/source/shared/evadb.rst b/docs/source/shared/evadb.rst new file mode 100644 index 0000000000..1708c630c0 --- /dev/null +++ b/docs/source/shared/evadb.rst @@ -0,0 +1,14 @@ +Prerequisites +------------- + +To follow along, you will need to set up a local instance of EvaDB via :ref:`pip`. + +Connect to EvaDB +---------------- + +After installing EvaDB, use the following Python code to establish a connection and obtain a ``cursor`` for running ``EvaQL`` queries. + +.. 
code-block:: python + + import evadb + cursor = evadb.connect().cursor() diff --git a/docs/source/shared/footer.rst b/docs/source/shared/footer.rst new file mode 100644 index 0000000000..3f5eb6d8b8 --- /dev/null +++ b/docs/source/shared/footer.rst @@ -0,0 +1,11 @@ +What's Next? +------------ + +👋 EvaDB's vision is to bring AI inside your database system and make it easy to build fast AI-powered apps. If you liked this tutorial and are excited about our vision, show some ❤️ by: + +- 🐙 giving a ⭐ for the EvaDB repository on Github: https://github.com/georgia-tech-db/evadb +- 📟 engaging with the EvaDB community on Slack to ask questions and share your ideas and thoughts: https://evadb.ai/community +- 🎉 contributing to EvaDB by developing cool applications/integrations: https://github.com/georgia-tech-db/evadb/issues +- 🐦 following us on Twitter: https://twitter.com/evadb_ai +- 📝 following us on Medium: https://medium.com/evadb-blog + diff --git a/docs/source/shared/nlp.rst b/docs/source/shared/nlp.rst new file mode 100644 index 0000000000..78447ca227 --- /dev/null +++ b/docs/source/shared/nlp.rst @@ -0,0 +1,6 @@ +Leverage Text Processing AI Engines with EvaDB +---------------------------------------------- + +By integrating databases and AI engines using EvaDB, developers can easily extract insights from text data with just a few EvaQL queries. These powerful natural language processing (NLP) models from ``OpenAI`` and ``HuggingFace`` are capable of complex text processing tasks (e.g., answering complex questions with ``context`` obtained from a column in a table). + +EvaDB makes it easy for developers to easily incorporate powerful NLP capabilities into their AI-powered applications while saving time and resources compared to traditional AI development pipelines. 
\ No newline at end of file diff --git a/docs/source/shared/postgresql.rst b/docs/source/shared/postgresql.rst new file mode 100644 index 0000000000..26906486cf --- /dev/null +++ b/docs/source/shared/postgresql.rst @@ -0,0 +1,38 @@ +Connect EvaDB to PostgreSQL Database Server +------------------------------------------- + +We will assume that you have a ``PostgreSQL`` database server running locally that contains the data needed for analysis. Follow these instructions to install `PostgreSQL `_. + +EvaDB lets you connect to your favorite databases, data warehouses, data lakes, etc., via the ``CREATE DATABASE`` statement. In this query, we connect EvaDB to an existing ``PostgreSQL`` server: + +.. tab-set:: + + .. tab-item:: Python + + .. code-block:: python + + params = { + "user": "eva", + "password": "password", + "host": "localhost", + "port": "5432", + "database": "evadb", + } + query = f"CREATE DATABASE postgres_data + WITH ENGINE = 'postgres', + PARAMETERS = {params};" + cursor.query(query).df() + + .. tab-item:: SQL + + .. code-block:: text + + CREATE DATABASE postgres_data + WITH ENGINE = 'postgres', + PARAMETERS = { + "user": "eva", + "password": "password", + "host": "localhost", + "port": "5432", + "database": "evadb" + } diff --git a/docs/source/usecases/emotion-analysis.rst b/docs/source/usecases/emotion-analysis.rst new file mode 100644 index 0000000000..f4708d2794 --- /dev/null +++ b/docs/source/usecases/emotion-analysis.rst @@ -0,0 +1,107 @@ +.. _emotion-analysis: + +Emotion Analysis +================ + +.. raw:: html + + + + + + +
+ Run on Google Colab + + View source on GitHub + + Download notebook +


+ + +Introduction +------------ + +In this tutorial, we present how to use ``PyTorch`` models in EvaDB to detect faces and classify their emotions. In particular, we focus on detecting faces in a person's video and classifying their emotions. EvaDB makes it easy to do face detection and emotion classification using its built-in support for ``PyTorch`` models. + +In this tutorial, we will showcase a use case where we ``chain`` the outputs of two AI models in a single query. After detecting faces, we will ``crop`` the bounding box of the detected face and send it to an ``EmotionDetector`` function. + +.. include:: ../shared/evadb.rst + +We will assume that the input ``defhappy`` video is loaded into ``EvaDB``. To download the video and load it into ``EvaDB``, see the complete `emotion analysis notebook on Colab `_. + +Create Face and Emotion Detection Functions +------------------------------------------- + +To create custom ``FaceDetector`` and ``EmotionDetector`` functions, use the ``CREATE FUNCTION`` statement. In these queries, we leverage EvaDB's built-in support for custom models. We will assume that the files containing these functions are downloaded and stored locally. Now, run the following queries to register these functions: + +.. code-block:: sql + + CREATE FUNCTION IF NOT EXISTS FaceDetector + INPUT (frame NDARRAY UINT8(3, ANYDIM, ANYDIM)) + OUTPUT (bboxes NDARRAY FLOAT32(ANYDIM, 4), + scores NDARRAY FLOAT32(ANYDIM)) + TYPE FaceDetection + IMPL 'face_detector.py'; + + CREATE FUNCTION IF NOT EXISTS EmotionDetector + INPUT (frame NDARRAY UINT8(3, ANYDIM, ANYDIM)) + OUTPUT (labels NDARRAY STR(ANYDIM), + scores NDARRAY FLOAT32(ANYDIM)) + TYPE Classification + IMPL 'emotion_detector.py'; + +The ``FaceDetector`` function takes a ``frame`` as input (``NDARRAY`` type) and returns bounding boxes (``bboxes``) of detected faces along with corresponding confidence scores (``scores``).
+ +The ``EmotionDetector`` function takes a ``frame`` as input (``NDARRAY`` type) and returns a label along with the corresponding confidence score. + +Emotion Analysis Queries +------------------------ + +After the functions are registered in ``EvaDB``, you can use them in subsequent SQL queries in different ways. + +In the following query, we call the ``FaceDetector`` function on every image in the video. The output of the function is stored in the ``bboxes`` and ``scores`` columns of the output ``DataFrame``. + +.. code-block:: sql + + SELECT id, FaceDetector(data) + FROM HAPPY + WHERE id < 10; + +This query returns the faces detected in the first ten frames of the video: + +.. code-block:: + + +-----------------------------------------------------------------------------------------------------+ + | objectdetectionvideos.id | yolo.labels | + +--------------------------+-----------------------------------------------------------------+ + | 0 | [car, car, car, car, car, car, person, car, ... | + | 1 | [car, car, car, car, car, car, car, car, car, ... | + +-----------------------------------------------------------------------------------------------------+ + +Chaining Functions in a Single AI Query +--------------------------------------- + +In the following query, we use the output of the ``FaceDetector`` to crop the detected face from the frame and send it to the ``EmotionDetector`` to identify the emotion in that person's face. Here, ``Crop`` is a built-in function in EvaDB that is used for cropping the given bounding box (``bbox``) from the given frame (``data``). + +We use ``LATERAL JOIN`` clause in the query to map the output of the ``FaceDetector`` to each frame in the ``HAPPY`` video table. + +.. code-block:: sql + + SELECT id, bbox, EmotionDetector(Crop(data, bbox)) + FROM HAPPY + JOIN LATERAL UNNEST(FaceDetector(data)) AS Face(bbox, conf) + WHERE id < 15; + +Now, the ``DataFrame`` only contains the emotions of the detected faces: + +.. 
code-block:: + + +------------------------------+ + | objectdetectionvideos.label | + |------------------------------| + | 6 | + | 6 | + +------------------------------+ + +.. include:: ../shared/footer.rst diff --git a/docs/source/usecases/image-classification.rst b/docs/source/usecases/image-classification.rst index 577e7f8eca..5f4a4aad5a 100644 --- a/docs/source/usecases/image-classification.rst +++ b/docs/source/usecases/image-classification.rst @@ -1,54 +1,61 @@ -Image Classification Pipeline using EvaDB -==== +.. _image-classification: -Assume the database has loaded a video ``mnist_video``. +Image Classification +==================== -1. Connect to EvaDB ----- +.. raw:: html -.. code-block:: python + + + + + +
+ Run on Google Colab + + View source on GitHub + + Download notebook +


+ - import evadb - cursor = evadb.connect().cursor() +Introduction +------------ -2. Register Image Classification Model as a Function in SQL ----- +In this tutorial, we present how to use ``PyTorch`` models in EvaDB to classify images. In particular, we focus on classifying images from the ``MNIST`` dataset that contains ``digits``. EvaDB makes it easy to do image classification using its built-in support for ``PyTorch`` models. -Create an image classification function from python source code. +In this tutorial, besides classifying images, we will also showcase a query where the model's output is used to retrieve images with the digit ``6``. -.. code-block:: python +.. include:: ../shared/evadb.rst - query = cursor.query(""" - CREATE UDF IF NOT EXISTS MnistImageClassifier - IMPL 'evadb/udfs/mnist_image_classifier.py'; - """).execute() +We will assume that the input ``MNIST`` video is loaded into ``EvaDB``. To download the video and load it into ``EvaDB``, see the complete `image classification notebook on Colab `_. -3. Execute Image Classification through SQL ----- +Create Image Classification Function +------------------------------------ -After the function is registered to EvaDB system, it can be directly called and used in SQL query. +To create a custom ``MnistImageClassifier`` function, use the ``CREATE FUNCTION`` statement. The code for the custom classification model is available `here `_. -.. tab-set:: - - .. tab-item:: Python +We will assume that the file is downloaded and stored as ``mnist_image_classifier.py``. Now, run the following query to register the AI function: - .. code-block:: python +.. code-block:: sql - query = cursor.table("mnist_video").select("MnistImageClassifier(data).label") - - # Get results in a DataFrame. - query.df() + CREATE FUNCTION + IF NOT EXISTS MnistImageClassifier + IMPL 'mnist_image_classifier.py'; +Image Classification Queries +---------------------------- - .. 
tab-item:: SQL +After the function is registered in ``EvaDB``, you can use it subsequent SQL queries in different ways. - .. code-block:: sql +In the following query, we call the classifier on every image in the video. The output of the function is stored in the ``label`` column (i.e., the digit associated with the given frame) of the output ``DataFrame``. - SELECT MnistImageClassifier(data).label FROM mnist_video; +.. code-block:: sql - + SELECT MnistImageClassifier(data).label + FROM mnist_video; -The result contains a projected ``label`` column, which indicates the digit of a particular frame. +This query returns the label of all the images: .. code-block:: @@ -57,42 +64,27 @@ The result contains a projected ``label`` column, which indicates the digit of a |------------------------------| | 6 | | 6 | - | 6 | - | 6 | - | 6 | - | 6 | + | ... | + | ... | + | ... | + | ... | | 4 | | 4 | + +------------------------------+ - ... ... - -4. Optional: Process Only Segments of Videos based on Conditions ----- - -Like normal SQL, you can also specify conditions to filter out some frames of the video. - -.. tab-set:: - - .. tab-item:: Python - - .. code-block:: python - - query = cursor.table("mnist_video") \ - .filter("id < 2") \ - .select("MnistImageClassifier(data).label") - - # Return results in a DataFrame. - query.df() - .. tab-item:: SQL +Filtering Based on AI Function +------------------------------ - .. code-block:: sql +In the following query, we use the output of the classifier to retrieve a subset of images that contain a particular digit (e.g., ``6``). - SELECT MnistImageClassifier(data).label FROM mnist_video - WHERE id < 2 +.. code-block:: sql + SELECT id, MnistImageClassifier(data).label + FROM mnist_video + WHERE MnistImageClassifier(data).label = '6'; -Now, the ``DataFrame`` only contains 2 rows after filtering. +Now, the ``DataFrame`` only contains images of the digit ``6``. .. 
code-block:: @@ -103,4 +95,4 @@ Now, the ``DataFrame`` only contains 2 rows after filtering. | 6 | +------------------------------+ -Check out our `Jupyter Notebook `_ for working example. \ No newline at end of file +.. include:: ../shared/footer.rst \ No newline at end of file diff --git a/docs/source/usecases/image-search.rst b/docs/source/usecases/image-search.rst new file mode 100644 index 0000000000..b30ef1100b --- /dev/null +++ b/docs/source/usecases/image-search.rst @@ -0,0 +1,137 @@ +.. _image-search: + +Image Search +============ + +.. raw:: html + + + + + + +
+ Run on Google Colab + + View source on GitHub + + Download notebook +


+ + +Introduction +------------ + +In this tutorial, we present how to use classical vision models (i.e., ``SIFT feature extractor``) in EvaDB to search for similar images powered by a ``vector index``. In particular, we focus on retrieving similar images from the ``Reddit`` dataset that contain similar ``motifs``. EvaDB makes it easy to do image search using its built-in support for vision models and vector database systems (e.g., ``FAISS``). + +.. include:: ../shared/evadb.rst + +We will assume that the input ``Reddit`` image collection is loaded into ``EvaDB``. To download this image dataset and load it into ``EvaDB``, see the complete `image search notebook on Colab `_. + +Create Image Feature Extraction Function +---------------------------------------- + +To create a custom ``SiftFeatureExtractor`` function, use the ``CREATE FUNCTION`` statement. We will assume that the file is downloaded and stored as ``sift_feature_extractor.py``. Now, run the following query to register this function: + +.. tab-set:: + + .. tab-item:: Python + + .. code-block:: python + + cursor.query(""" + CREATE FUNCTION + IF NOT EXISTS SiftFeatureExtractor + IMPL 'evadb/udfs/sift_feature_extractor.py' + """).df() + + .. tab-item:: SQL + + .. code-block:: sql + + CREATE FUNCTION + IF NOT EXISTS SiftFeatureExtractor + IMPL 'evadb/udfs/sift_feature_extractor.py' + + +Create Vector Index for Similar Image Search +-------------------------------------------- + +To locate images with similar appearance, we next create an index based on the feature vectors returned by ``SiftFeatureExtractor`` on the loaded images. EvaDB will later use this vector index to quickly return similar images. + +EvaDB lets you connect to your favorite vector database via the ``CREATE INDEX`` statement. In this query, we will create a new index using the ``FAISS`` vector index framework from ``Meta``. 
+ +The following EvaQL statement creates a vector index on the ``SiftFeatureExtractor(data)`` column in the ``reddit_dataset`` table: + +.. tab-set:: + + .. tab-item:: Python + + .. code-block:: python + + cursor.query(""" + CREATE INDEX reddit_sift_image_index + ON reddit_dataset (SiftFeatureExtractor(data)) + USING FAISS; + """).df() + + .. tab-item:: SQL + + .. code-block:: sql + + CREATE INDEX reddit_sift_image_index + ON reddit_dataset (SiftFeatureExtractor(data)) + USING FAISS; + +Similar Image Search Powered By Vector Index +-------------------------------------------- + +EvaQL supports the ``ORDER BY`` and ``LIMIT`` clauses to retrieve the ``top-k`` most similar images for a given image. + +EvaDB contains a built-in ``Similarity(x, y)`` function that computes the Euclidean distance between ``x`` and ``y``. We will use this function to compare the feature vector of the image being searched (i.e., the given image) and the feature vectors of all the images in the dataset that are stored in the vector index. + +EvaDB's query optimizer automatically picks the correct vector index to accelerate a given EvaQL query. It uses the vector index created in the prior step to accelerate the following image search query: + +.. tab-set:: + + .. tab-item:: Python + + .. code-block:: python + + query = cursor.query(""" + SELECT name FROM reddit_dataset ORDER BY + Similarity( + SiftFeatureExtractor(Open('reddit-images/g1074_d4mxztt.jpg')), + SiftFeatureExtractor(data) + ) + LIMIT 5 + """).df() + + .. tab-item:: SQL + + .. code-block:: sql + + SELECT name FROM reddit_dataset ORDER BY + Similarity( + SiftFeatureExtractor(Open('reddit-images/g1074_d4mxztt.jpg')), + SiftFeatureExtractor(data) + ) + LIMIT 5 + +This query returns the top-5 most similar images in a ``DataFrame``: + +.. 
code-block:: + + +---------------------------------+ + | reddit_dataset.name | + |---------------------------------| + | reddit-images/g1074_d4mxztt.jpg | + | reddit-images/g348_d7ju7dq.jpg | + | reddit-images/g1209_ct6bf1n.jpg | + | reddit-images/g1190_cln9xzr.jpg | + | reddit-images/g1190_clna2x2.jpg | + +---------------------------------+ + +.. include:: ../shared/footer.rst \ No newline at end of file diff --git a/docs/source/usecases/object-detection.rst b/docs/source/usecases/object-detection.rst new file mode 100644 index 0000000000..d29924d3d0 --- /dev/null +++ b/docs/source/usecases/object-detection.rst @@ -0,0 +1,91 @@ +.. _object-detection: + +Object Detection +================ + +.. raw:: html + + + + + + +
+ Run on Google Colab + + View source on GitHub + + Download notebook +


+ + +Introduction +------------ + +In this tutorial, we present how to use ``YOLO`` models in EvaDB to detect objects. In particular, we focus on detecting objects from the challenging, real-world ``UA-DETRAC`` dataset. EvaDB makes it easy to do object detection using its built-in support for ``YOLO`` models. + +In this tutorial, besides detecting objects, we will also showcase a query where the model's output is used to retrieve a subset of frames with ``pedestrian`` and ``car`` objects. + +.. include:: ../shared/evadb.rst + +We will assume that the input ``UA-DETRAC`` video is loaded into ``EvaDB``. To download the video and load it into ``EvaDB``, see the complete `object detection notebook on Colab `_. + +Create Object Detection Function +-------------------------------- + +To create a custom ``Yolo`` function based on the popular ``YOLO-v8m`` model, use the ``CREATE FUNCTION`` statement. In this query, we leverage EvaDB's built-in support for ``ultralytics`` models. We only need to specify the ``model`` parameter in the query to create this function: + +.. code-block:: sql + + CREATE FUNCTION IF NOT EXISTS Yolo + TYPE ultralytics + MODEL 'yolov8m.pt'; + +Object Detection Queries +------------------------ + +After the function is registered in ``EvaDB``, you can use it in subsequent SQL queries in different ways. + +In the following query, we call the object detector on every image in the video. The output of the function is stored in the ``labels`` column (i.e., the labels of the objects detected in the given frame) of the output ``DataFrame``. + +.. code-block:: sql + + SELECT id, Yolo(data) + FROM ObjectDetectionVideos + WHERE id < 20 + LIMIT 5; + +This query returns the labels of the objects detected in each frame: + +.. 
code-block:: + + +-----------------------------------------------------------------------------------------------------+ + | objectdetectionvideos.id | yolo.labels | + +--------------------------+-----------------------------------------------------------------+ + | 0 | [car, car, car, car, car, car, person, car, ... | + | 1 | [car, car, car, car, car, car, car, car, car, ... | + +-----------------------------------------------------------------------------------------------------+ + +Filtering Based on YOLO Function +-------------------------------- + +In the following query, we use the output of the object detector to retrieve a subset of frames that contain a ``pedestrian`` and a ``car``. + +.. code-block:: sql + + SELECT id + FROM ObjectDetectionVideos + WHERE ['pedestrian', 'car'] <@ Yolo(data).label; + +Now, the ``DataFrame`` only contains frames with the desired objects: + +.. code-block:: + + +------------------------------+ + | objectdetectionvideos.label | + |------------------------------| + | 6 | + | 6 | + +------------------------------+ + +.. include:: ../shared/footer.rst diff --git a/docs/source/usecases/privategpt.rst b/docs/source/usecases/privategpt.rst new file mode 100644 index 0000000000..767cc8895e --- /dev/null +++ b/docs/source/usecases/privategpt.rst @@ -0,0 +1,2 @@ +PrivateGPT +========== diff --git a/docs/source/usecases/qa-video.rst b/docs/source/usecases/question-answering.rst similarity index 58% rename from docs/source/usecases/qa-video.rst rename to docs/source/usecases/question-answering.rst index 3145ca85b1..d6692804ab 100644 --- a/docs/source/usecases/qa-video.rst +++ b/docs/source/usecases/question-answering.rst @@ -1,62 +1,85 @@ -Q&A Application on Videos -==== +.. _question-answering: -1. Connect to EvaDB ----- +Question Answering +================== -.. code-block:: python +.. raw:: html - import evadb - cursor = evadb.connect().cursor() + + + + + +
+ Run on Google Colab + + View source on GitHub + + Download notebook +


+ -2. Register Functions ----- -Register speech-to-text **whisper** model from `HuggingFace` +Introduction +------------ -.. code-block:: python +In this tutorial, we present how to use ``HuggingFace`` and ``OpenAI`` models in EvaDB to answer questions based on videos. In particular, we will first convert the speech component of the video to text using the ``HuggingFace`` model. The generated transcript is stored in a table as a ``text`` column for subsequent analysis. We then use an ``OpenAI`` model to answer questions based on the ``text`` column. + +EvaDB makes it easy to answer questions based on videos using its built-in support for ``HuggingFace`` and ``OpenAI`` models. + +.. include:: ../shared/evadb.rst + +We will assume that the input ``ukraine_video`` video is loaded into ``EvaDB``. To download the video and load it into ``EvaDB``, see the complete `question answering notebook on Colab `_. - cursor.query(""" - CREATE UDF SpeechRecognizer - TYPE HuggingFace - 'task' 'automatic-speech-recognition' - 'model' 'openai/whisper-base'; - """).execute() + +Create Speech Recognition Function +---------------------------------- + +To create a custom ``SpeechRecognizer`` function based on the popular ``Whisper`` model, use the ``CREATE FUNCTION`` statement. In this query, we leverage EvaDB's built-in support for ``HuggingFace`` models. We only need to specify the ``task`` and the ``model`` parameters in the query to create this function: + +.. code-block:: sql + + CREATE FUNCTION SpeechRecognizer + TYPE HuggingFace + TASK 'automatic-speech-recognition' + MODEL 'openai/whisper-base'; .. note:: - EvaDB allows users to register any model in HuggingFace as a function. + EvaDB has built-in support for a wide range of :ref:`HuggingFace` models. -Register **OpenAI** LLM model +Create ChatGPT Function +------------------------ -.. code-block:: python +EvaDB has built-in support for ``ChatGPT`` function from ``OpenAI``. 
You will need to configure the ``OpenAI`` key in the environment as shown below: - cursor.query(""" - CREATE UDF ChatGPT - IMPL 'evadb/udfs/chatgpt.py' - """).execute() +.. code-block:: python - # Set OpenAI token + # Set OpenAI key import os os.environ["OPENAI_KEY"] = "sk-..." .. note:: + EvaDB has built-in support for a wide range of :ref:`OpenAI` models. You can also switch to another large language model that runs locally by defining a :ref:`Custom function`. + + The ChatGPT function is a wrapper around the OpenAI API call. You can also switch to other LLM models that can run locally. -3. Summarize Video in Text ----- +Convert Speech to Text +---------------------- -Create a table with text summary of the video. Text summarization is generated by running speech-to-text ``Whisper`` model from ``HuggingFace``. +After registering the ``SpeechRecognizer`` function, we run it over the video to obtain the video's transcript. EvaDB supports direct reference to the ``audio`` component of the video as shown in this query: -.. code-block:: python +.. code-block:: sql - cursor.query(""" CREATE TABLE text_summary AS - SELECT SpeechRecognizer(audio) FROM ukraine_video; - """).execute() + SELECT SpeechRecognizer(audio) + FROM ukraine_video; + +Here, the ``SpeechRecognizer`` function is applied on the ``audio`` component of the ``ukraine_video`` video loaded into EvaDB. The output of the ``SpeechRecognizer`` function is stored in the ``text`` column of the ``text_summary`` table. -This results a table shown below. +Here is the query's output ``DataFrame``: .. code-block:: @@ -64,21 +87,21 @@ This results a table shown below. 
| text_summary.text | |-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | The war in Ukraine has been on for 415 days. Who is winning it? Not Russia. Certainly not Ukraine. It is the US oil companies. US oil companies have reached $200 billion in pure profits. The earnings are still on. They are still milking this war and sharing the spoils. Let us look at how Exxon mobile has been doing. In 2022, the company made $56 billion in profits. Oil companies capitalized on instability and they are profiting from pain. American oil companies are masters of this art. You may remember the war in Iraq. The US went to war in Iraq by selling a lie. The Americans did not find any weapons of mass destruction but they did find lots of oil. And in the year since, American officials have admitted this. And this story is not over. It's repeating itself in Ukraine. They are feeding another war and filling the coffers of US oil companies. 
| - +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -4. Q&A using ChatGPT ----- +Question Answering using ChatGPT +-------------------------------- -We can now embed the ChatGPT prompt inside SQL with text summary from the table as its knowledge base. 
+We next run a EvaQL query that uses the ``ChatGPT`` function on the ``text`` column to answer questions based on the video. The ``text`` column serves as important ``context`` for the large language model. This query checks if the video is related to the war between Ukraine and Russia. -.. code-block:: python +.. code-block:: sql - cursor.query(""" - SELECT ChatGPT('Is this video summary related to Ukraine russia war', text) + SELECT ChatGPT( + 'Is this video summary related to Ukraine russia war', + text) FROM text_summary; - """).df() -This query returns a projected ``DataFrame``. +Here is the query's output ``DataFrame``: .. code-block:: @@ -86,4 +109,8 @@ This query returns a projected ``DataFrame``. | chatgpt.response | |-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | Based on the provided context, it seems that the video summary is related to the Ukraine-Russia war. It discusses how US oil companies are allegedly profiting from the war in Ukraine, similar to how they allegedly benefited from the war in Iraq. | - +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ \ No newline at end of file + +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +.. include:: ../shared/nlp.rst + +.. 
include:: ../shared/footer.rst diff --git a/docs/source/usecases/sentiment-analysis.rst b/docs/source/usecases/sentiment-analysis.rst new file mode 100644 index 0000000000..e45d2d236a --- /dev/null +++ b/docs/source/usecases/sentiment-analysis.rst @@ -0,0 +1,86 @@ +.. _sentiment-analysis: + +Sentiment Analysis +================== + +.. raw:: html + + + + + + +
+ Run on Google Colab + + View source on GitHub + + Download notebook +


+ + +Introduction +------------ + +In this tutorial, we present how to use OpenAI models in EvaDB to analyse sentiment in text data. In particular, we focus on analysing sentiments expressed by customers in food reviews. EvaDB makes it easy to do sentiment analysis using its built-in ChatGPT AI function. In this tutorial, besides classifying sentiment, we will also use another query to generate responses to customers for addressing ``negative`` reviews. + +We will assume that the input data is loaded into a ``PostgreSQL`` database. +To load the food review data into your database, see the complete `sentiment analysis notebook on Colab `_. + +.. include:: ../shared/evadb.rst + +.. include:: ../shared/postgresql.rst + +Sentiment Analysis of Reviews using ChatGPT +------------------------------------------- + +We run the following query to analyze whether the review is ``positive`` or ``negative`` with a custom ChatGPT prompt. Here, the query runs on the ``review`` column in the ``review_table`` that is a part of the ``PostgreSQL`` database. + +.. code-block:: sql + + SELECT ChatGPT( + "Is the review positive or negative? Only reply 'positive' or 'negative'. Here are examples. The food is very bad: negative. The food is very good: positive.", + review) + FROM postgres_data.review_table; + +This query returns the sentiment of the reviews in the table: + +.. code-block:: + + +------------------------------+ + | chatgpt.response | + |------------------------------| + | negative | + | positive | + | negative | + +------------------------------+ + +Respond to Negative reviews using ChatGPT +----------------------------------------- + +Let's next respond to negative food reviews using another EvaQL query that first retrieves the reviews with ``negative`` sentiment, and processes those reviews with another ChatGPT function call that generates a response to address the concerns shared in the review. + +.. 
code-block:: sql + + SELECT ChatGPT( + "Respond the the review with solution to address the review's concern", + review) + FROM postgres_data.review_table + WHERE ChatGPT( + "Is the review positive or negative. Only reply 'positive' or 'negative'. Here are examples. The food is very bad: negative. The food is very good: positive.", + review) = "negative"; + +While running this query, EvaDB first retrieves the negative reviews and then applies ChatGPT to derive a response. Here is the query's output ``DataFrame``: + +.. code-block:: + + +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | chatgpt.response | + |----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| + | Dear valued customer, Thank you for bringing this matter to our attention. We apologize for the inconvenience caused by the excessive saltiness of your fried rice. We understand how important it is to have a satisfying dining experience, and we would like to make it right for you ... | + | Dear [Customer's Name], Thank you for bringing this issue to our attention. We apologize for the inconvenience caused by the missing chicken sandwich in your takeout order. We understand how frustrating it can be when an item is missing from your meal. To address this concern, we ... 
| + +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +.. include:: ../shared/nlp.rst + +.. include:: ../shared/footer.rst diff --git a/docs/source/usecases/similar-image-search.rst b/docs/source/usecases/similar-image-search.rst deleted file mode 100644 index 83f69e395b..0000000000 --- a/docs/source/usecases/similar-image-search.rst +++ /dev/null @@ -1,83 +0,0 @@ -Image Similarity Search Pipeline using EvaDB on Images -==== - -In this use case, we want to search similar images based on an image provided by the user. To implement this use case, we leverage EvaDB's capability of easily expressing feature extraction pipeline. Additionaly, we also leverage EvaDB's capability of building a similarity search index and searching the index to -locate similar images through ``FAISS`` library. - -For this use case, we use a reddit image dataset that can be downloaded from `Here `_. -We populate a table in the database that contains all images. - -1. Connect to EvaDB ----- - -.. code-block:: python - - import evadb - cursor = evadb.connect().cursor() - -2. Register SIFT as Function ----- - -.. code-block:: python - - cursor.query(""" - CREATE UDF IF NOT EXISTS SiftFeatureExtractor - IMPL 'evadb/udfs/sift_feature_extractor.py' - """).execute() - -3. Search Similar Images ----- - -To locate images that have similar appearance, we will first build an index based on embeddings of images. -Then, for the given image, EvaDB can find similar images by searching in the index. - -Build Index using ``FAISS`` -**** - -The below query creates a new index on the projected column ``SiftFeatureExtractor(data)`` from the ``reddit_dataset`` table. - -.. 
code-block:: python - - cursor.query(""" - CREATE INDEX reddit_sift_image_index - ON reddit_dataset (SiftFeatureExtractor(data)) - USING FAISS - """).execute() - -Search Index for a Given Image -**** - -EvaDB leverages the ``ORDER BY ... LIMIT ...`` SQL syntax to retrieve the top 5 similar images. -In this example, ``Similarity(x, y)`` is a built-in function to calculate distance between ``x`` and ``y``. -In current version, ``x`` is a single tuple and ``y`` is a column that contains multiple tuples. -By default EvaDB does pairwise distance calculation between ``x`` and all tuples from ``y``. -In this case, EvaDB leverages the index that we have already built. - -.. code-block:: python - - query = cursor.query(""" - SELECT name FROM reddit_dataset ORDER BY - Similarity( - SiftFeatureExtractor(Open('reddit-images/g1074_d4mxztt.jpg')), - SiftFeatureExtractor(data) - ) - LIMIT 5 - """) - query.df() - -The ``DataFrame`` contains the top 5 similar images. - -.. code-block:: - - +---------------------------------+ - | reddit_dataset.name | - |---------------------------------| - | reddit-images/g1074_d4mxztt.jpg | - | reddit-images/g348_d7ju7dq.jpg | - | reddit-images/g1209_ct6bf1n.jpg | - | reddit-images/g1190_cln9xzr.jpg | - | reddit-images/g1190_clna2x2.jpg | - +---------------------------------+ - -Check out our `Jupyter Notebook `_ for working example. -We also demonstrate more complicated features of EvaDB for similarity search. \ No newline at end of file diff --git a/docs/source/usecases/text-summarization.rst b/docs/source/usecases/text-summarization.rst new file mode 100644 index 0000000000..e0267b8836 --- /dev/null +++ b/docs/source/usecases/text-summarization.rst @@ -0,0 +1,88 @@ +.. _text-summarization: + +Text Summarization +================== + +.. raw:: html + + + + + + +
+ Run on Google Colab + + View source on GitHub + + Download notebook +


+ + + +Introduction +------------ + +In this tutorial, we present how to use ``HuggingFace`` models in EvaDB to summarize and classify text. In particular, we will first load ``PDF`` documents into ``EvaDB`` using the ``LOAD PDF`` statement. The text in each paragraph from the PDF is automatically stored in the ``data`` column of a table for subsequent analysis. We then run text summarization and text classification AI queries on the ``data`` column obtained from the loaded ``PDF`` documents. + +EvaDB makes it easy to process text using its built-in support for ``HuggingFace``. + +.. include:: ../shared/evadb.rst + +We will assume that the input ``pdf_sample1`` PDF is loaded into ``EvaDB``. To download the PDF and load it into ``EvaDB``, see the complete `text summarization notebook on Colab `_. + + +Create Text Summarization and Classification Functions +------------------------------------------------------ + +To create custom ``TextSummarizer`` and ``TextClassifier`` functions, use the ``CREATE FUNCTION`` statement. In these queries, we leverage EvaDB's built-in support for ``HuggingFace`` models. We only need to specify the ``task`` and the ``model`` parameters in the query to create these functions: + +.. code-block:: sql + + CREATE FUNCTION IF NOT EXISTS TextSummarizer + TYPE HuggingFace + TASK 'summarization' + MODEL 'facebook/bart-large-cnn'; + + CREATE FUNCTION IF NOT EXISTS TextClassifier + TYPE HuggingFace + TASK 'text-classification' + MODEL 'distilbert-base-uncased-finetuned-sst-2-english'; + +.. note:: + + EvaDB has built-in support for a wide range of :ref:`HuggingFace` models. + +AI Query Using Registered Functions +----------------------------------- + +After registering these two functions, we use them in a single AI query over the ``data`` column to retrieve a subset of paragraphs from the loaded ``PDF`` documents with `negative` sentiment: + +.. 
code-block:: sql + + CREATE TABLE text_summary AS + SELECT data, TextSummarizer(data) + FROM MyPDFs + WHERE page = 1 + AND paragraph >= 1 AND paragraph <= 3 + AND TextClassifier(data).label = 'NEGATIVE'; + +Here, the ``TextClassifier`` function is applied on the ``data`` column of the ``pdf_sample1`` PDF loaded into EvaDB and its output is used to filter out a subset of paragraphs with `negative sentiment`. + +EvaDB's query optimizer automatically applies the earlier predicates on page number and paragraph numbers to (e.g., ``page = 1``) to avoid running the expensive ``TextClassifier`` function on all the rows in the table. After filtering out a subset of paragraphs, EvaDB applies the ``TextSummarizer`` function to derive their summaries. + +Here is the query's output ``DataFrame``: + +.. code-block:: + + +--------------------------------------------------------+--------------------------------------------------------+ + | mypdfs.data | mypdfs.summary_text | + +--------------------------------------------------------+--------------------------------------------------------+ + | DEFINATION  Specialized connective tissue wit... | Specialized connective tissue with fluid matri... | + | PHYSICAL CHARACTERISTICS ( 1 ) COLOUR -- R... | The temperature is 38° C / 100.4° F. The body ... | + +--------------------------------------------------------+--------------------------------------------------------+ + + +.. include:: ../shared/nlp.rst + +.. include:: ../shared/footer.rst diff --git a/evadb/README.md b/evadb/README.md index a2b48f8c8e..5032db88e5 100644 --- a/evadb/README.md +++ b/evadb/README.md @@ -10,6 +10,6 @@ * SELECT statements are expanded to different operators PROJECT and FILTER, etc. * `optimizer / plan_generator.py` - Generation of query plan. * All operators in the tree are converted to group expression for optimization(`optimizer / group_expression.py`). -* The optimizer rewrites the query based on a collection of UDF-centric rules(`optimizer / rules`). 
+* The optimizer rewrites the query based on a collection of function-centric rules(`optimizer / rules`). * `executor` - The execution engine runs the physical plan generated by the optimizer. * `storage` - This component manages both structured (using `sqlalchemy`) and unstructured data on disk. diff --git a/evadb/binder/binder_utils.py b/evadb/binder/binder_utils.py index d6e299c673..bb1b36edbd 100644 --- a/evadb/binder/binder_utils.py +++ b/evadb/binder/binder_utils.py @@ -17,6 +17,8 @@ import re from typing import TYPE_CHECKING, List +import pandas as pd + from evadb.catalog.catalog_type import ColumnType, TableType from evadb.catalog.catalog_utils import ( get_video_table_column_definitions, @@ -70,7 +72,7 @@ def check_data_source_and_table_are_valid( logger.error(error) raise BinderError(error) - # Check table existance. + # Check table existence. table_df = resp.data if table_name not in table_df["table_name"].values: error = "Table {} does not exist in data source {}. Create the table using native query.".format( @@ -88,11 +90,16 @@ def check_data_source_and_table_are_valid( def create_table_catalog_entry_for_data_source( - table_name: str, column_name_list: List[str] + table_name: str, column_info: pd.DataFrame ): + column_name_list = list(column_info["name"]) + column_type_list = [ + ColumnType.python_type_to_evadb_type(dtype) + for dtype in list(column_info["dtype"]) + ] column_list = [] - for column_name in column_name_list: - column_list.append(ColumnCatalogEntry(column_name, ColumnType.ANY)) + for name, dtype in zip(column_name_list, column_type_list): + column_list.append(ColumnCatalogEntry(name, dtype)) # Assemble table. table_catalog_entry = TableCatalogEntry( @@ -133,7 +140,7 @@ def bind_native_table_info(catalog: CatalogManager, table_info: TableInfo): # Assemble columns. 
column_df = handler.get_columns(table_info.table_name).data table_info.table_obj = create_table_catalog_entry_for_data_source( - table_info.table_name, list(column_df["column_name"]) + table_info.table_name, column_df ) @@ -291,10 +298,10 @@ def handle_bind_extract_object_function( binder_context.bind(tracker) # append the bound output of detector for obj in detector.output_objs: - col_alias = "{}.{}".format(obj.udf_name.lower(), obj.name.lower()) + col_alias = "{}.{}".format(obj.function_name.lower(), obj.name.lower()) child = TupleValueExpression( obj.name, - table_alias=obj.udf_name.lower(), + table_alias=obj.function_name.lower(), col_object=obj, col_alias=col_alias, ) @@ -319,7 +326,7 @@ def get_column_definition_from_select_target_list( ) -> List[ColumnDefinition]: """ This function is used by CREATE TABLE AS (SELECT...) and - CREATE UDF FROM (SELECT ...) to get the output objs from the + CREATE FUNCTION FROM (SELECT ...) to get the output objs from the child SELECT statement. """ binded_col_list = [] @@ -345,7 +352,7 @@ def drop_row_id_from_target_list( target_list: List[AbstractExpression], ) -> List[AbstractExpression]: """ - This function is intended to be used by CREATE UDF FROM (SELECT * FROM ...) and CREATE TABLE AS SELECT * FROM ... to exclude the row_id column. + This function is intended to be used by CREATE FUNCTION FROM (SELECT * FROM ...) and CREATE TABLE AS SELECT * FROM ... to exclude the row_id column. 
""" filtered_list = [] for expr in target_list: diff --git a/evadb/binder/statement_binder.py b/evadb/binder/statement_binder.py index 2d578caa58..002ea0cfb2 100644 --- a/evadb/binder/statement_binder.py +++ b/evadb/binder/statement_binder.py @@ -35,18 +35,18 @@ from evadb.expression.abstract_expression import AbstractExpression, ExpressionType from evadb.expression.function_expression import FunctionExpression from evadb.expression.tuple_value_expression import TupleValueExpression +from evadb.parser.create_function_statement import CreateFunctionStatement from evadb.parser.create_index_statement import CreateIndexStatement from evadb.parser.create_statement import CreateTableStatement -from evadb.parser.create_udf_statement import CreateUDFStatement from evadb.parser.delete_statement import DeleteTableStatement from evadb.parser.explain_statement import ExplainStatement from evadb.parser.rename_statement import RenameTableStatement from evadb.parser.select_statement import SelectStatement from evadb.parser.statement import AbstractStatement from evadb.parser.table_ref import TableRef -from evadb.parser.types import UDFType -from evadb.third_party.huggingface.binder import assign_hf_udf -from evadb.utils.generic_utils import load_udf_class_from_file +from evadb.parser.types import FunctionType +from evadb.third_party.huggingface.binder import assign_hf_function +from evadb.utils.generic_utils import load_function_class_from_file from evadb.utils.logging_manager import logger @@ -72,8 +72,8 @@ def _bind_abstract_expr(self, node: AbstractExpression): def _bind_explain_statement(self, node: ExplainStatement): self.bind(node.explainable_stmt) - @bind.register(CreateUDFStatement) - def _bind_create_udf_statement(self, node: CreateUDFStatement): + @bind.register(CreateFunctionStatement) + def _bind_create_function_statement(self, node: CreateFunctionStatement): if node.query is not None: self.bind(node.query) # Drop the automatically generated _row_id column @@ -86,33 
+86,36 @@ def _bind_create_udf_statement(self, node: CreateUDFStatement): arg_map = {key: value for key, value in node.metadata} assert ( "predict" in arg_map - ), f"Creating {node.udf_type} UDFs expects 'predict' metadata." + ), f"Creating {node.function_type} functions expects 'predict' metadata." # We only support a single predict column for now predict_columns = set([arg_map["predict"]]) inputs, outputs = [], [] for column in all_column_list: if column.name in predict_columns: - column.name = column.name + "_predictions" + if node.function_type != "Forecasting": + column.name = column.name + "_predictions" + else: + column.name = column.name outputs.append(column) else: inputs.append(column) assert ( len(node.inputs) == 0 and len(node.outputs) == 0 - ), f"{node.udf_type} UDFs' input and output are auto assigned" + ), f"{node.function_type} functions' input and output are auto assigned" node.inputs, node.outputs = inputs, outputs @bind.register(CreateIndexStatement) def _bind_create_index_statement(self, node: CreateIndexStatement): self.bind(node.table_ref) - if node.udf_func: - self.bind(node.udf_func) + if node.function: + self.bind(node.function) # TODO: create index currently only supports single numpy column. assert len(node.col_list) == 1, "Index cannot be created on more than 1 column" # TODO: create index currently only works on TableInfo, but will extend later. assert node.table_ref.is_table_atom(), "Index can only be created on Tableinfo" - if not node.udf_func: + if not node.function: # Feature table type needs to be float32 numpy array. assert ( len(node.col_list) == 1 @@ -133,9 +136,11 @@ def _bind_create_index_statement(self, node: CreateIndexStatement): ), "Index input needs to be float32." assert len(col.array_dimensions) == 2 else: - # Output of the UDF should be 2 dimension and float32 type. 
- udf_obj = self._catalog().get_udf_catalog_entry_by_name(node.udf_func.name) - for output in udf_obj.outputs: + # Output of the function should be 2 dimension and float32 type. + function_obj = self._catalog().get_function_catalog_entry_by_name( + node.function.name + ) + for output in function_obj.outputs: assert ( output.array_type == NdArrayType.FLOAT32 ), "Index input needs to be float32." @@ -269,7 +274,7 @@ def _bind_tuple_expr(self, node: TupleValueExpression): @bind.register(FunctionExpression) def _bind_func_expr(self, node: FunctionExpression): # handle the special case of "extract_object" - if node.name.upper() == str(UDFType.EXTRACT_OBJECT): + if node.name.upper() == str(FunctionType.EXTRACT_OBJECT): handle_bind_extract_object_function(node, self) return @@ -284,68 +289,80 @@ def _bind_func_expr(self, node: FunctionExpression): for child in node.children: self.bind(child) - udf_obj = self._catalog().get_udf_catalog_entry_by_name(node.name) - if udf_obj is None: + function_obj = self._catalog().get_function_catalog_entry_by_name(node.name) + if function_obj is None: err_msg = ( f"Function '{node.name}' does not exist in the catalog. " - "Please create the function using CREATE UDF command." + "Please create the function using CREATE FUNCTION command." ) logger.error(err_msg) raise BinderError(err_msg) - if udf_obj.type == "HuggingFace": - node.function = assign_hf_udf(udf_obj) + if function_obj.type == "HuggingFace": + node.function = assign_hf_function(function_obj) - elif udf_obj.type == "Ludwig": - udf_class = load_udf_class_from_file( - udf_obj.impl_file_path, + elif function_obj.type == "Ludwig": + function_class = load_function_class_from_file( + function_obj.impl_file_path, "GenericLudwigModel", ) - udf_metadata = get_metadata_properties(udf_obj) - assert "model_path" in udf_metadata, "Ludwig models expect 'model_path'." 
- node.function = lambda: udf_class(model_path=udf_metadata["model_path"]) + function_metadata = get_metadata_properties(function_obj) + assert ( + "model_path" in function_metadata + ), "Ludwig models expect 'model_path'." + node.function = lambda: function_class( + model_path=function_metadata["model_path"] + ) else: - if udf_obj.type == "ultralytics": - # manually set the impl_path for yolo udfs we only handle object + if function_obj.type == "ultralytics": + # manually set the impl_path for yolo functions we only handle object # detection for now, hopefully this can be generalized - udf_dir = Path(EvaDB_INSTALLATION_DIR) / "udfs" - udf_obj.impl_file_path = ( - Path(f"{udf_dir}/yolo_object_detector.py").absolute().as_posix() + function_dir = Path(EvaDB_INSTALLATION_DIR) / "functions" + function_obj.impl_file_path = ( + Path(f"{function_dir}/yolo_object_detector.py") + .absolute() + .as_posix() ) - # Verify the consistency of the UDF. If the checksum of the UDF does not + # Verify the consistency of the function. If the checksum of the function does not # match the one stored in the catalog, an error will be thrown and the user - # will be asked to register the UDF again. + # will be asked to register the function again. # assert ( - # get_file_checksum(udf_obj.impl_file_path) == udf_obj.checksum - # ), f"""UDF file {udf_obj.impl_file_path} has been modified from the - # registration. Please use DROP UDF to drop it and re-create it # using CREATE UDF.""" + # get_file_checksum(function_obj.impl_file_path) == function_obj.checksum + # ), f"""Function file {function_obj.impl_file_path} has been modified from the + # registration. 
Please use DROP FUNCTION to drop it and re-create it # using CREATE FUNCTION.""" try: - udf_class = load_udf_class_from_file( - udf_obj.impl_file_path, - udf_obj.name, + function_class = load_function_class_from_file( + function_obj.impl_file_path, + function_obj.name, ) - # certain udfs take additional inputs like yolo needs the model_name + # certain functions take additional inputs like yolo needs the model_name # these arguments are passed by the user as part of metadata - node.function = lambda: udf_class(**get_metadata_properties(udf_obj)) + node.function = lambda: function_class( + **get_metadata_properties(function_obj) + ) except Exception as e: err_msg = ( - f"{str(e)}. Please verify that the UDF class name in the " - "implementation file matches the UDF name." + f"{str(e)}. Please verify that the function class name in the " + "implementation file matches the function name." ) logger.error(err_msg) raise BinderError(err_msg) - node.udf_obj = udf_obj - output_objs = self._catalog().get_udf_io_catalog_output_entries(udf_obj) + node.function_obj = function_obj + output_objs = self._catalog().get_function_io_catalog_output_entries( + function_obj + ) if node.output: for obj in output_objs: if obj.name.lower() == node.output: node.output_objs = [obj] if not node.output_objs: - err_msg = f"Output {node.output} does not exist for {udf_obj.name}." + err_msg = ( + f"Output {node.output} does not exist for {function_obj.name}." 
+ ) logger.error(err_msg) raise BinderError(err_msg) node.projection_columns = [node.output] diff --git a/evadb/binder/statement_binder_context.py b/evadb/binder/statement_binder_context.py index 84ee310fe1..b1101a2b36 100644 --- a/evadb/binder/statement_binder_context.py +++ b/evadb/binder/statement_binder_context.py @@ -21,14 +21,14 @@ ) from evadb.catalog.catalog_type import TableType from evadb.catalog.models.column_catalog import ColumnCatalogEntry +from evadb.catalog.models.function_io_catalog import FunctionIOCatalogEntry from evadb.catalog.models.table_catalog import TableCatalogEntry -from evadb.catalog.models.udf_io_catalog import UdfIOCatalogEntry from evadb.expression.function_expression import FunctionExpression from evadb.expression.tuple_value_expression import TupleValueExpression from evadb.third_party.databases.interface import get_database_handler from evadb.utils.logging_manager import logger -CatalogColumnType = Union[ColumnCatalogEntry, UdfIOCatalogEntry] +CatalogColumnType = Union[ColumnCatalogEntry, FunctionIOCatalogEntry] class StatementBinderContext: @@ -93,7 +93,7 @@ def add_table_alias(self, alias: str, database_name: str, table_name: str): # Assemble columns. 
column_df = handler.get_columns(table_name).data table_obj = create_table_catalog_entry_for_data_source( - table_name, list(column_df["column_name"]) + table_name, column_df ) else: table_obj = self._catalog().get_table_catalog_entry(table_name) @@ -104,14 +104,14 @@ def add_derived_table_alias( self, alias: str, target_list: List[ - Union[TupleValueExpression, FunctionExpression, UdfIOCatalogEntry] + Union[TupleValueExpression, FunctionExpression, FunctionIOCatalogEntry] ], ): """ Add a alias -> derived table column mapping Arguments: alias (str): name of alias - target_list: list of TupleValueExpression or FunctionExpression or UdfIOCatalogEntry + target_list: list of TupleValueExpression or FunctionExpression or FunctionIOCatalogEntry """ self._check_duplicate_alias(alias) col_alias_map = {} diff --git a/evadb/catalog/catalog_manager.md b/evadb/catalog/catalog_manager.md index 106a34edac..22b54fac12 100644 --- a/evadb/catalog/catalog_manager.md +++ b/evadb/catalog/catalog_manager.md @@ -1,6 +1,6 @@ # Catalog Manager -CatalogManager class provides a set of services to interact with a database that stores metadata about tables, columns, and user-defined functions (UDFs). Information like what is the data type in a certain column in a table, type of a table, its name, etc.. It contains functions to get, insert and delete catalog entries for Tables, UDFs, UDF IOs, Columns and Indexes. +CatalogManager class provides a set of services to interact with a database that stores metadata about tables, columns, and user-defined functions. Information like what is the data type in a certain column in a table, type of a table, its name, etc.. It contains functions to get, insert and delete catalog entries for Tables, Functions, Function IOs, Columns and Indexes. This data is stored in the eva_catalog.db file which can be found in `evadb_data` folder. 
@@ -10,10 +10,10 @@ Catalog manager currently has 7 services in it: TableCatalogService() ColumnCatalogService() IndexCatalogService() -UdfCatalogService() -UdfIOCatalogService() -UdfCostCatalogService() -UdfMetadataCatalogService() +FunctionCatalogService() +FunctionIOCatalogService() +FunctionCostCatalogService() +FunctionMetadataCatalogService() ``` ## Catalog Services @@ -55,20 +55,20 @@ save_file_path: str type: VectorStoreType row_id: int feat_column_id: int -udf_signature: str +function_signature: str feat_column: ColumnCatalogEntry ``` -### UdfCatalog +### FunctionCatalog Fields: ``` name: str impl_file_path: str type: str row_id: int -args: List[UdfIOCatalogEntry] -outputs: List[UdfIOCatalogEntry] +args: List[FunctionIOCatalogEntry] +outputs: List[FunctionIOCatalogEntry] ``` -### UdfIOCatalog +### FunctionIOCatalog Fields: ``` name: str @@ -77,26 +77,26 @@ is_nullable: bool array_type: NdArrayType array_dimensions: Tuple[int] is_input: bool -udf_id: int -udf_name: str +function_id: int +function_name: str row_id: int ``` -### UdfCostCatalog +### FunctionCostCatalog Fields: ``` -udf_id: int +function_id: int name: str cost: float row_id: int ``` -### UdfMetadataCatalog +### FunctionMetadataCatalog Fields: ``` key: str value: str -udf_id: int -udf_name: str +function_id: int +function_name: str row_id: int ``` \ No newline at end of file diff --git a/evadb/catalog/catalog_manager.py b/evadb/catalog/catalog_manager.py index ac4c7a0e1d..83ec869001 100644 --- a/evadb/catalog/catalog_manager.py +++ b/evadb/catalog/catalog_manager.py @@ -24,7 +24,7 @@ ) from evadb.catalog.catalog_utils import ( cleanup_storage, - construct_udf_cache_catalog_entry, + construct_function_cache_catalog_entry, get_document_table_column_definitions, get_image_table_column_definitions, get_pdf_table_column_definitions, @@ -34,28 +34,32 @@ from evadb.catalog.models.utils import ( ColumnCatalogEntry, DatabaseCatalogEntry, + FunctionCacheCatalogEntry, + FunctionCatalogEntry, + 
FunctionCostCatalogEntry, + FunctionIOCatalogEntry, + FunctionMetadataCatalogEntry, IndexCatalogEntry, TableCatalogEntry, - UdfCacheCatalogEntry, - UdfCatalogEntry, - UdfCostCatalogEntry, - UdfIOCatalogEntry, - UdfMetadataCatalogEntry, drop_all_tables_except_catalog, init_db, truncate_catalog_tables, ) from evadb.catalog.services.column_catalog_service import ColumnCatalogService from evadb.catalog.services.database_catalog_service import DatabaseCatalogService +from evadb.catalog.services.function_cache_catalog_service import ( + FunctionCacheCatalogService, +) +from evadb.catalog.services.function_catalog_service import FunctionCatalogService +from evadb.catalog.services.function_cost_catalog_service import ( + FunctionCostCatalogService, +) +from evadb.catalog.services.function_io_catalog_service import FunctionIOCatalogService +from evadb.catalog.services.function_metadata_catalog_service import ( + FunctionMetadataCatalogService, +) from evadb.catalog.services.index_catalog_service import IndexCatalogService from evadb.catalog.services.table_catalog_service import TableCatalogService -from evadb.catalog.services.udf_cache_catalog_service import UdfCacheCatalogService -from evadb.catalog.services.udf_catalog_service import UdfCatalogService -from evadb.catalog.services.udf_cost_catalog_service import UdfCostCatalogService -from evadb.catalog.services.udf_io_catalog_service import UdfIOCatalogService -from evadb.catalog.services.udf_metadata_catalog_service import ( - UdfMetadataCatalogService, -) from evadb.catalog.sql_config import IDENTIFIER_COLUMN, SQLConfig from evadb.configuration.configuration_manager import ConfigurationManager from evadb.expression.function_expression import FunctionExpression @@ -75,12 +79,18 @@ def __init__(self, db_uri: str, config: ConfigurationManager): self._db_catalog_service = DatabaseCatalogService(self._sql_config.session) self._table_catalog_service = TableCatalogService(self._sql_config.session) self._column_service = 
ColumnCatalogService(self._sql_config.session) - self._udf_service = UdfCatalogService(self._sql_config.session) - self._udf_cost_catalog_service = UdfCostCatalogService(self._sql_config.session) - self._udf_io_service = UdfIOCatalogService(self._sql_config.session) - self._udf_metadata_service = UdfMetadataCatalogService(self._sql_config.session) + self._function_service = FunctionCatalogService(self._sql_config.session) + self._function_cost_catalog_service = FunctionCostCatalogService( + self._sql_config.session + ) + self._function_io_service = FunctionIOCatalogService(self._sql_config.session) + self._function_metadata_service = FunctionMetadataCatalogService( + self._sql_config.session + ) self._index_service = IndexCatalogService(self._sql_config.session) - self._udf_cache_service = UdfCacheCatalogService(self._sql_config.session) + self._function_cache_service = FunctionCacheCatalogService( + self._sql_config.session + ) @property def sql_config(self): @@ -277,90 +287,96 @@ def get_column_catalog_entries_by_table(self, table_obj: TableCatalogEntry): col_entries = self._column_service.filter_entries_by_table(table_obj) return col_entries - "udf catalog services" + "function catalog services" - def insert_udf_catalog_entry( + def insert_function_catalog_entry( self, name: str, impl_file_path: str, type: str, - udf_io_list: List[UdfIOCatalogEntry], - udf_metadata_list: List[UdfMetadataCatalogEntry], - ) -> UdfCatalogEntry: - """Inserts a UDF catalog entry along with UDF_IO entries. + function_io_list: List[FunctionIOCatalogEntry], + function_metadata_list: List[FunctionMetadataCatalogEntry], + ) -> FunctionCatalogEntry: + """Inserts a function catalog entry along with Function_IO entries. It persists the entry to the database. 
Arguments: - name(str): name of the udf - impl_file_path(str): implementation path of the udf - type(str): what kind of udf operator like classification, + name(str): name of the function + impl_file_path(str): implementation path of the function + type(str): what kind of function operator like classification, detection etc - udf_io_list(List[UdfIOCatalogEntry]): input/output udf info list + function_io_list(List[FunctionIOCatalogEntry]): input/output function info list Returns: - The persisted UdfCatalogEntry object. + The persisted FunctionCatalogEntry object. """ checksum = get_file_checksum(impl_file_path) - udf_entry = self._udf_service.insert_entry(name, impl_file_path, type, checksum) - for udf_io in udf_io_list: - udf_io.udf_id = udf_entry.row_id - self._udf_io_service.insert_entries(udf_io_list) - for udf_metadata in udf_metadata_list: - udf_metadata.udf_id = udf_entry.row_id - self._udf_metadata_service.insert_entries(udf_metadata_list) - return udf_entry - - def get_udf_catalog_entry_by_name(self, name: str) -> UdfCatalogEntry: + function_entry = self._function_service.insert_entry( + name, impl_file_path, type, checksum + ) + for function_io in function_io_list: + function_io.function_id = function_entry.row_id + self._function_io_service.insert_entries(function_io_list) + for function_metadata in function_metadata_list: + function_metadata.function_id = function_entry.row_id + self._function_metadata_service.insert_entries(function_metadata_list) + return function_entry + + def get_function_catalog_entry_by_name(self, name: str) -> FunctionCatalogEntry: """ - Get the UDF information based on name. + Get the function information based on name. 
Arguments: - name (str): name of the UDF + name (str): name of the function Returns: - UdfCatalogEntry object + FunctionCatalogEntry object """ - return self._udf_service.get_entry_by_name(name) + return self._function_service.get_entry_by_name(name) - def delete_udf_catalog_entry_by_name(self, udf_name: str) -> bool: - return self._udf_service.delete_entry_by_name(udf_name) + def delete_function_catalog_entry_by_name(self, function_name: str) -> bool: + return self._function_service.delete_entry_by_name(function_name) - def get_all_udf_catalog_entries(self): - return self._udf_service.get_all_entries() + def get_all_function_catalog_entries(self): + return self._function_service.get_all_entries() - "udf cost catalog services" + "function cost catalog services" - def upsert_udf_cost_catalog_entry( - self, udf_id: int, name: str, cost: int - ) -> UdfCostCatalogEntry: - """Upserts UDF cost catalog entry. + def upsert_function_cost_catalog_entry( + self, function_id: int, name: str, cost: int + ) -> FunctionCostCatalogEntry: + """Upserts function cost catalog entry. Arguments: - udf_id(int): unique udf id - name(str): the name of the udf - cost(int): cost of this UDF + function_id(int): unique function id + name(str): the name of the function + cost(int): cost of this function Returns: - The persisted UdfCostCatalogEntry object. + The persisted FunctionCostCatalogEntry object. 
""" - self._udf_cost_catalog_service.upsert_entry(udf_id, name, cost) + self._function_cost_catalog_service.upsert_entry(function_id, name, cost) - def get_udf_cost_catalog_entry(self, name: str): - return self._udf_cost_catalog_service.get_entry_by_name(name) + def get_function_cost_catalog_entry(self, name: str): + return self._function_cost_catalog_service.get_entry_by_name(name) - "UdfIO services" + "FunctionIO services" - def get_udf_io_catalog_input_entries( - self, udf_obj: UdfCatalogEntry - ) -> List[UdfIOCatalogEntry]: - return self._udf_io_service.get_input_entries_by_udf_id(udf_obj.row_id) + def get_function_io_catalog_input_entries( + self, function_obj: FunctionCatalogEntry + ) -> List[FunctionIOCatalogEntry]: + return self._function_io_service.get_input_entries_by_function_id( + function_obj.row_id + ) - def get_udf_io_catalog_output_entries( - self, udf_obj: UdfCatalogEntry - ) -> List[UdfIOCatalogEntry]: - return self._udf_io_service.get_output_entries_by_udf_id(udf_obj.row_id) + def get_function_io_catalog_output_entries( + self, function_obj: FunctionCatalogEntry + ) -> List[FunctionIOCatalogEntry]: + return self._function_io_service.get_output_entries_by_function_id( + function_obj.row_id + ) """ Index related services. 
""" @@ -370,21 +386,21 @@ def insert_index_catalog_entry( save_file_path: str, vector_store_type: VectorStoreType, feat_column: ColumnCatalogEntry, - udf_signature: str, + function_signature: str, ) -> IndexCatalogEntry: index_catalog_entry = self._index_service.insert_entry( - name, save_file_path, vector_store_type, feat_column, udf_signature + name, save_file_path, vector_store_type, feat_column, function_signature ) return index_catalog_entry def get_index_catalog_entry_by_name(self, name: str) -> IndexCatalogEntry: return self._index_service.get_entry_by_name(name) - def get_index_catalog_entry_by_column_and_udf_signature( - self, column: ColumnCatalogEntry, udf_signature: str + def get_index_catalog_entry_by_column_and_function_signature( + self, column: ColumnCatalogEntry, function_signature: str ): - return self._index_service.get_entry_by_column_and_udf_signature( - column, udf_signature + return self._index_service.get_entry_by_column_and_function_signature( + column, function_signature ) def drop_index_catalog_entry(self, index_name: str) -> bool: @@ -393,39 +409,45 @@ def drop_index_catalog_entry(self, index_name: str) -> bool: def get_all_index_catalog_entries(self): return self._index_service.get_all_entries() - """ Udf Cache related""" + """ Function Cache related""" - def insert_udf_cache_catalog_entry(self, func_expr: FunctionExpression): + def insert_function_cache_catalog_entry(self, func_expr: FunctionExpression): cache_dir = self._config.get_value("storage", "cache_dir") - entry = construct_udf_cache_catalog_entry(func_expr, cache_dir=cache_dir) - return self._udf_cache_service.insert_entry(entry) + entry = construct_function_cache_catalog_entry(func_expr, cache_dir=cache_dir) + return self._function_cache_service.insert_entry(entry) - def get_udf_cache_catalog_entry_by_name(self, name: str) -> UdfCacheCatalogEntry: - return self._udf_cache_service.get_entry_by_name(name) + def get_function_cache_catalog_entry_by_name( + self, name: str + ) -> 
FunctionCacheCatalogEntry: + return self._function_cache_service.get_entry_by_name(name) - def drop_udf_cache_catalog_entry(self, entry: UdfCacheCatalogEntry) -> bool: + def drop_function_cache_catalog_entry( + self, entry: FunctionCacheCatalogEntry + ) -> bool: # remove the data structure associated with the entry if entry: shutil.rmtree(entry.cache_path) - return self._udf_cache_service.delete_entry(entry) + return self._function_cache_service.delete_entry(entry) - """ UDF Metadata Catalog""" + """ function Metadata Catalog""" - def get_udf_metadata_entries_by_udf_name( - self, udf_name: str - ) -> List[UdfMetadataCatalogEntry]: + def get_function_metadata_entries_by_function_name( + self, function_name: str + ) -> List[FunctionMetadataCatalogEntry]: """ - Get the UDF metadata information for the provided udf. + Get the function metadata information for the provided function. Arguments: - udf_name (str): name of the UDF + function_name (str): name of the function Returns: - UdfMetadataCatalogEntry objects + FunctionMetadataCatalogEntry objects """ - udf_entry = self.get_udf_catalog_entry_by_name(udf_name) - if udf_entry: - entries = self._udf_metadata_service.get_entries_by_udf_id(udf_entry.row_id) + function_entry = self.get_function_catalog_entry_by_name(function_name) + if function_entry: + entries = self._function_metadata_service.get_entries_by_function_id( + function_entry.row_id + ) return entries else: return [] diff --git a/evadb/catalog/catalog_type.py b/evadb/catalog/catalog_type.py index eadaa50da5..35521016bb 100644 --- a/evadb/catalog/catalog_type.py +++ b/evadb/catalog/catalog_type.py @@ -44,6 +44,19 @@ class ColumnType(EvaDBEnum): NDARRAY # noqa: F821 ANY # noqa: F821 + @classmethod + def python_type_to_evadb_type(cls, dtype): + if dtype is str: + return cls.TEXT + elif dtype is int: + return cls.INTEGER + elif dtype is float: + return cls.FLOAT + elif dtype is bool: + return cls.BOOLEAN + else: + return cls.NDARRAY + class NdArrayType(EvaDBEnum): 
INT8 # noqa: F821 diff --git a/evadb/catalog/catalog_utils.py b/evadb/catalog/catalog_utils.py index 8be6648976..d2187978d2 100644 --- a/evadb/catalog/catalog_utils.py +++ b/evadb/catalog/catalog_utils.py @@ -27,27 +27,18 @@ ) from evadb.catalog.models.utils import ( ColumnCatalogEntry, + FunctionCacheCatalogEntry, + FunctionCatalogEntry, TableCatalogEntry, - UdfCacheCatalogEntry, - UdfCatalogEntry, ) from evadb.catalog.sql_config import IDENTIFIER_COLUMN from evadb.configuration.configuration_manager import ConfigurationManager -from evadb.executor.executor_utils import ExecutorError from evadb.expression.function_expression import FunctionExpression from evadb.expression.tuple_value_expression import TupleValueExpression from evadb.parser.create_statement import ColConstraintInfo, ColumnDefinition from evadb.utils.generic_utils import get_str_hash, remove_directory_contents -def generate_sqlalchemy_conn_str(engine: str, params: Dict[str, str]): - if engine == "postgres": - conn_str = f"""postgresql://{params["user"]}:{params["password"]}@{params["host"]}:{params["port"]}/{params["database"]}""" - else: - raise ExecutorError(f"Native engine: {engine} is not currently supported") - return conn_str - - def is_video_table(table: TableCatalogEntry): return table.table_type == TableType.VIDEO_DATA @@ -227,24 +218,24 @@ def xform_column_definitions_to_catalog_entries( return result_list -def construct_udf_cache_catalog_entry( +def construct_function_cache_catalog_entry( func_expr: FunctionExpression, cache_dir: str -) -> UdfCacheCatalogEntry: - """Constructs a udf cache catalog entry from a given function expression. +) -> FunctionCacheCatalogEntry: + """Constructs a function cache catalog entry from a given function expression. It is assumed that the function expression has already been bound using the binder. 
- The catalog entry is populated with dependent udfs and columns by traversing the + The catalog entry is populated with dependent functions and columns by traversing the expression tree. The cache name is represented by the signature of the function expression. Args: func_expr (FunctionExpression): the function expression with which the cache is associated cache_dir (str): path to store the cache Returns: - UdfCacheCatalogEntry: the udf cache catalog entry + FunctionCacheCatalogEntry: the function cache catalog entry """ - udf_depends = [] + function_depends = [] col_depends = [] for expr in func_expr.find_all(FunctionExpression): - udf_depends.append(expr.udf_obj.row_id) + function_depends.append(expr.function_obj.row_id) for expr in func_expr.find_all(TupleValueExpression): col_depends.append(expr.col_object.row_id) cache_name = func_expr.signature() @@ -253,12 +244,12 @@ def construct_udf_cache_catalog_entry( path = str(get_str_hash(cache_name + uuid.uuid4().hex)) cache_path = str(Path(cache_dir) / Path(f"{path}_{func_expr.name}")) args = tuple([arg.signature() for arg in func_expr.children]) - entry = UdfCacheCatalogEntry( + entry = FunctionCacheCatalogEntry( name=func_expr.signature(), - udf_id=func_expr.udf_obj.row_id, + function_id=func_expr.function_obj.row_id, cache_path=cache_path, args=args, - udf_depends=udf_depends, + function_depends=function_depends, col_depends=col_depends, ) @@ -272,14 +263,14 @@ def cleanup_storage(config): def get_metadata_entry_or_val( - udf_obj: UdfCatalogEntry, key: str, default_val: Any = None + function_obj: FunctionCatalogEntry, key: str, default_val: Any = None ) -> str: """ Return the metadata value for the given key, or the default value if the key is not found. Args: - udf_obj (UdfCatalogEntry): An object of type `UdfCatalogEntry` which is + function_obj (FunctionCatalogEntry): An object of type `FunctionCatalogEntry` which is used to extract metadata information. 
key (str): The metadata key for which the corresponding value needs to be retrieved. default_val (Any): The default value to be returned if the metadata key is not found. @@ -287,24 +278,24 @@ def get_metadata_entry_or_val( Returns: str: metadata value """ - for metadata in udf_obj.metadata: + for metadata in function_obj.metadata: if metadata.key == key: return metadata.value return default_val -def get_metadata_properties(udf_obj: UdfCatalogEntry) -> Dict: +def get_metadata_properties(function_obj: FunctionCatalogEntry) -> Dict: """ Return all the metadata properties as key value pair Args: - udf_obj (UdfCatalogEntry): An object of type `UdfCatalogEntry` which is + function_obj (FunctionCatalogEntry): An object of type `FunctionCatalogEntry` which is used to extract metadata information. Returns: Dict: key-value for each metadata entry """ properties = {} - for metadata in udf_obj.metadata: + for metadata in function_obj.metadata: properties[metadata.key] = metadata.value return properties diff --git a/evadb/catalog/models/association_models.py b/evadb/catalog/models/association_models.py index 0622ab4490..59f28196ee 100644 --- a/evadb/catalog/models/association_models.py +++ b/evadb/catalog/models/association_models.py @@ -16,21 +16,21 @@ from evadb.catalog.models.base_model import BaseModel -# dependency table to maintain a many-to-many relationship between udf_catalog and udf_cache_catalog. This is important to ensure that any changes to udf are propagated to udf_cache. For example, deletion of a udf should also clear the associated caches. +# dependency table to maintain a many-to-many relationship between function_catalog and function_cache_catalog. This is important to ensure that any changes to function are propagated to function_cache. For example, deletion of a function should also clear the associated caches. 
-depend_udf_and_udf_cache = Table( - "depend_udf_and_udf_cache", +depend_function_and_function_cache = Table( + "depend_function_and_function_cache", BaseModel.metadata, - Column("_udf_id", ForeignKey("udf_catalog._row_id")), - Column("_udf_cache_id", ForeignKey("udf_cache._row_id")), - UniqueConstraint("_udf_id", "_udf_cache_id"), + Column("_function_id", ForeignKey("function_catalog._row_id")), + Column("_function_cache_id", ForeignKey("function_cache._row_id")), + UniqueConstraint("_function_id", "_function_cache_id"), ) -depend_column_and_udf_cache = Table( - "depend_column_and_udf_cache", +depend_column_and_function_cache = Table( + "depend_column_and_function_cache", BaseModel.metadata, Column("_col_id", ForeignKey("column_catalog._row_id")), - Column("_udf_cache_id", ForeignKey("udf_cache._row_id")), - UniqueConstraint("_col_id", "_udf_cache_id"), + Column("_function_cache_id", ForeignKey("function_cache._row_id")), + UniqueConstraint("_col_id", "_function_cache_id"), ) diff --git a/evadb/catalog/models/column_catalog.py b/evadb/catalog/models/column_catalog.py index 6907d27f1a..646753eb3b 100644 --- a/evadb/catalog/models/column_catalog.py +++ b/evadb/catalog/models/column_catalog.py @@ -22,7 +22,7 @@ from sqlalchemy.types import Enum from evadb.catalog.catalog_type import ColumnType, Dimension, NdArrayType -from evadb.catalog.models.association_models import depend_column_and_udf_cache +from evadb.catalog.models.association_models import depend_column_and_function_cache from evadb.catalog.models.base_model import BaseModel from evadb.catalog.models.utils import ColumnCatalogEntry @@ -37,7 +37,7 @@ class ColumnCatalog(BaseModel): `_array_type:` the type of array, as specified in `NdArrayType` (or `None` if the column is a primitive type) `_array_dimensions:` the dimensions of the array (if `_array_type` is not `None`) `_table_id:` the `_row_id` of the `TableCatalog` entry to which the column belongs - `_dep_caches`: list of udf caches associated with the 
column + `_dep_caches`: list of function caches associated with the column """ __tablename__ = "column_catalog" @@ -54,10 +54,10 @@ class ColumnCatalog(BaseModel): # Foreign key dependency with the table catalog _table_catalog = relationship("TableCatalog", back_populates="_columns") - # list of associated UdfCacheCatalog entries + # list of associated FunctionCacheCatalog entries _dep_caches = relationship( - "UdfCacheCatalog", - secondary=depend_column_and_udf_cache, + "FunctionCacheCatalog", + secondary=depend_column_and_function_cache, back_populates="_col_depends", cascade="all, delete", ) diff --git a/evadb/catalog/models/udf_cache_catalog.py b/evadb/catalog/models/function_cache_catalog.py similarity index 59% rename from evadb/catalog/models/udf_cache_catalog.py rename to evadb/catalog/models/function_cache_catalog.py index 31746b24df..84d7df0b9e 100644 --- a/evadb/catalog/models/udf_cache_catalog.py +++ b/evadb/catalog/models/function_cache_catalog.py @@ -19,65 +19,67 @@ from sqlalchemy.orm import relationship from evadb.catalog.models.association_models import ( - depend_column_and_udf_cache, - depend_udf_and_udf_cache, + depend_column_and_function_cache, + depend_function_and_function_cache, ) from evadb.catalog.models.base_model import BaseModel -from evadb.catalog.models.utils import UdfCacheCatalogEntry +from evadb.catalog.models.utils import FunctionCacheCatalogEntry -class UdfCacheCatalog(BaseModel): - """The `UdfCacheCatalog` catalog stores information about the udf cache. +class FunctionCacheCatalog(BaseModel): + """The `FunctionCacheCatalog` catalog stores information about the function cache. It maintains the following information for each cache entry: `_row_id:` An autogenerated identifier for the cache entry. - `_name:` The name of the cache, also referred to as the unique UDF signature. - `_udf_id:` `_row_id` of the UDF in the `UdfCatalog` for which the cache is built. 
+ `_name:` The name of the cache, also referred to as the unique function signature. + `_function_id:` `_row_id` of the function in the `FunctionCatalog` for which the cache is built. `_args:` A serialized list of `ColumnCatalog` `_row_id`s for each argument of the - UDF. If the argument is a function expression, it stores the string representation + Function. If the argument is a function expression, it stores the string representation of the expression tree. """ - __tablename__ = "udf_cache" + __tablename__ = "function_cache" _name = Column("name", String(128)) - _udf_id = Column( - "udf_id", Integer, ForeignKey("udf_catalog._row_id", ondelete="CASCADE") + _function_id = Column( + "function_id", + Integer, + ForeignKey("function_catalog._row_id", ondelete="CASCADE"), ) _cache_path = Column("cache_path", String(256)) _args = Column("args", String(1024)) - __table_args__ = (UniqueConstraint("name", "udf_id"), {}) + __table_args__ = (UniqueConstraint("name", "function_id"), {}) _col_depends = relationship( "ColumnCatalog", - secondary=depend_column_and_udf_cache, + secondary=depend_column_and_function_cache, back_populates="_dep_caches", # cascade="all, delete-orphan", ) - _udf_depends = relationship( - "UdfCatalog", - secondary=depend_udf_and_udf_cache, + _function_depends = relationship( + "FunctionCatalog", + secondary=depend_function_and_function_cache, back_populates="_dep_caches", # cascade="all, delete-orphan", ) - def __init__(self, name: str, udf_id: int, cache_path: str, args: Tuple[str]): + def __init__(self, name: str, function_id: int, cache_path: str, args: Tuple[str]): self._name = name - self._udf_id = udf_id + self._function_id = function_id self._cache_path = cache_path self._args = str(args) - def as_dataclass(self) -> "UdfCacheCatalogEntry": - udf_depends = [obj._row_id for obj in self._udf_depends] + def as_dataclass(self) -> "FunctionCacheCatalogEntry": + function_depends = [obj._row_id for obj in self._function_depends] col_depends = 
[obj._row_id for obj in self._col_depends] - return UdfCacheCatalogEntry( + return FunctionCacheCatalogEntry( row_id=self._row_id, name=self._name, - udf_id=self._udf_id, + function_id=self._function_id, cache_path=self._cache_path, args=literal_eval(self._args), - udf_depends=udf_depends, + function_depends=function_depends, col_depends=col_depends, ) diff --git a/evadb/catalog/models/udf_catalog.py b/evadb/catalog/models/function_catalog.py similarity index 68% rename from evadb/catalog/models/udf_catalog.py rename to evadb/catalog/models/function_catalog.py index 182ba2d7cc..721f5c4518 100644 --- a/evadb/catalog/models/udf_catalog.py +++ b/evadb/catalog/models/function_catalog.py @@ -17,39 +17,41 @@ from sqlalchemy import Column, String from sqlalchemy.orm import relationship -from evadb.catalog.models.association_models import depend_udf_and_udf_cache +from evadb.catalog.models.association_models import depend_function_and_function_cache from evadb.catalog.models.base_model import BaseModel -from evadb.catalog.models.utils import UdfCatalogEntry +from evadb.catalog.models.utils import FunctionCatalogEntry -class UdfCatalog(BaseModel): - """The `UdfCatalog` catalog stores information about the user-defined functions (UDFs) in the system. It maintains the following information for each UDF +class FunctionCatalog(BaseModel): + """The `FunctionCatalog` catalog stores information about the user-defined functions (Functions) in the system. 
It maintains the following information for each Function `_row_id:` an autogenerated identifier - `_impl_file_path: ` the path to the implementation script for the UDF - `_type:` an optional tag associated with the UDF (useful for grouping similar UDFs, such as multiple object detection UDFs) + `_impl_file_path: ` the path to the implementation script for the Function + `_type:` an optional tag associated with the function (useful for grouping similar Functions, such as multiple object detection Functions) """ - __tablename__ = "udf_catalog" + __tablename__ = "function_catalog" _name = Column("name", String(128), unique=True) _impl_file_path = Column("impl_file_path", String(128)) _type = Column("type", String(128)) _checksum = Column("checksum", String(512)) - # UdfIOCatalog storing the input/output attributes of the udf + # FunctionIOCatalog storing the input/output attributes of the function _attributes = relationship( - "UdfIOCatalog", back_populates="_udf", cascade="all, delete, delete-orphan" + "FunctionIOCatalog", + back_populates="_function", + cascade="all, delete, delete-orphan", ) _metadata = relationship( - "UdfMetadataCatalog", - back_populates="_udf", + "FunctionMetadataCatalog", + back_populates="_function", cascade="all, delete, delete-orphan", ) _dep_caches = relationship( - "UdfCacheCatalog", - secondary=depend_udf_and_udf_cache, - back_populates="_udf_depends", + "FunctionCacheCatalog", + secondary=depend_function_and_function_cache, + back_populates="_function_depends", cascade="all, delete", ) @@ -59,7 +61,7 @@ def __init__(self, name: str, impl_file_path: str, type: str, checksum: str): self._type = type self._checksum = checksum - def as_dataclass(self) -> "UdfCatalogEntry": + def as_dataclass(self) -> "FunctionCatalogEntry": args = [] outputs = [] for attribute in self._attributes: @@ -72,7 +74,7 @@ def as_dataclass(self) -> "UdfCatalogEntry": for meta_key_value in self._metadata: metadata.append(meta_key_value.as_dataclass()) - return 
UdfCatalogEntry( + return FunctionCatalogEntry( row_id=self._row_id, name=self._name, impl_file_path=self._impl_file_path, diff --git a/evadb/catalog/models/function_cost_catalog.py b/evadb/catalog/models/function_cost_catalog.py new file mode 100644 index 0000000000..486f5ee102 --- /dev/null +++ b/evadb/catalog/models/function_cost_catalog.py @@ -0,0 +1,53 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from sqlalchemy import Column, Float, ForeignKey, Integer, String + +from evadb.catalog.models.base_model import BaseModel +from evadb.catalog.models.utils import FunctionCostCatalogEntry + + +class FunctionCostCatalog(BaseModel): + """The `FunctionCostCatalog` catalog stores information about the runtime of user-defined functions (Functions) in the system. It maintains the following information for each Function. + `_row_id:` an autogenerated unique identifier. 
+ `_name:` name of the Function + `_function_id`: the row_id of the Function + `_cost:` cost of this Function + """ + + __tablename__ = "function_cost_catalog" + + _function_id = Column( + "function_id", + Integer, + ForeignKey("function_catalog._row_id", ondelete="CASCADE"), + ) + _function_name = Column( + "name", String(128), ForeignKey("function_catalog.name", ondelete="CASCADE") + ) + _cost = Column("cost", Float) + + def __init__(self, function_id: int, name: str, cost: float): + self._function_id = function_id + self._function_name = name + self._cost = cost + + def as_dataclass(self) -> "FunctionCostCatalogEntry": + return FunctionCostCatalogEntry( + function_id=self._function_id, + name=self._function_name, + cost=self._cost, + row_id=self._row_id, + ) diff --git a/evadb/catalog/models/udf_io_catalog.py b/evadb/catalog/models/function_io_catalog.py similarity index 75% rename from evadb/catalog/models/udf_io_catalog.py rename to evadb/catalog/models/function_io_catalog.py index b24f0d8552..1059e9807f 100644 --- a/evadb/catalog/models/udf_io_catalog.py +++ b/evadb/catalog/models/function_io_catalog.py @@ -21,12 +21,12 @@ from evadb.catalog.catalog_type import ColumnType, Dimension, NdArrayType from evadb.catalog.models.base_model import BaseModel -from evadb.catalog.models.utils import UdfIOCatalogEntry +from evadb.catalog.models.utils import FunctionIOCatalogEntry -class UdfIOCatalog(BaseModel): - """The `UdfIOCatalog` catalog stores information about the input and output - attributes of user-defined functions (UDFs). It maintains the following information +class FunctionIOCatalog(BaseModel): + """The `FunctionIOCatalog` catalog stores information about the input and output + attributes of user-defined functions (Functions). 
It maintains the following information for each attribute: `_row_id:` an autogenerated identifier `_name: ` name of the input/output argument @@ -34,10 +34,10 @@ class UdfIOCatalog(BaseModel): `_is_nullable:` which indicates whether it is nullable `_array_type:` the type of array, as specified in `NdArrayType` (or `None` if the attribute is a primitive type) `_array_dimensions:` the dimensions of the array (if `_array_type` is not `None`) - `_udf_id:` the `_row_id` of the `UdfCatalog` entry to which the attribute belongs + `_function_id:` the `_row_id` of the `FunctionCatalog` entry to which the attribute belongs """ - __tablename__ = "udfio_catalog" + __tablename__ = "functionio_catalog" _name = Column("name", String(100)) _type = Column("type", Enum(ColumnType), default=Enum) @@ -45,12 +45,14 @@ class UdfIOCatalog(BaseModel): _array_type = Column("array_type", Enum(NdArrayType), nullable=True) _array_dimensions = Column("array_dimensions", String(100)) _is_input = Column("is_input", Boolean, default=True) - _udf_id = Column("udf_id", Integer, ForeignKey("udf_catalog._row_id")) + _function_id = Column( + "function_id", Integer, ForeignKey("function_catalog._row_id") + ) - __table_args__ = (UniqueConstraint("name", "udf_id"), {}) + __table_args__ = (UniqueConstraint("name", "function_id"), {}) - # Foreign key dependency with the udf catalog - _udf = relationship("UdfCatalog", back_populates="_attributes") + # Foreign key dependency with the function catalog + _function = relationship("FunctionCatalog", back_populates="_attributes") def __init__( self, @@ -60,7 +62,7 @@ def __init__( array_type: NdArrayType = None, array_dimensions: Tuple[int] = None, is_input: bool = True, - udf_id: int = None, + function_id: int = None, ): self._name = name self._type = type @@ -68,7 +70,7 @@ def __init__( self._array_type = array_type self.array_dimensions = array_dimensions or str(()) self._is_input = is_input - self._udf_id = udf_id + self._function_id = function_id @property 
def array_dimensions(self): @@ -88,8 +90,8 @@ def array_dimensions(self, value: Tuple[int]): dimensions.append(dim) self._array_dimensions = str(tuple(dimensions)) - def as_dataclass(self) -> "UdfIOCatalogEntry": - return UdfIOCatalogEntry( + def as_dataclass(self) -> "FunctionIOCatalogEntry": + return FunctionIOCatalogEntry( row_id=self._row_id, name=self._name, type=self._type, @@ -97,6 +99,6 @@ def as_dataclass(self) -> "UdfIOCatalogEntry": array_type=self._array_type, array_dimensions=self.array_dimensions, is_input=self._is_input, - udf_id=self._udf_id, - udf_name=self._udf._name, + function_id=self._function_id, + function_name=self._function._name, ) diff --git a/evadb/catalog/models/udf_metadata_catalog.py b/evadb/catalog/models/function_metadata_catalog.py similarity index 56% rename from evadb/catalog/models/udf_metadata_catalog.py rename to evadb/catalog/models/function_metadata_catalog.py index e5364f4ddf..40bb005539 100644 --- a/evadb/catalog/models/udf_metadata_catalog.py +++ b/evadb/catalog/models/function_metadata_catalog.py @@ -17,41 +17,43 @@ from sqlalchemy.orm import relationship from evadb.catalog.models.base_model import BaseModel -from evadb.catalog.models.utils import UdfMetadataCatalogEntry +from evadb.catalog.models.utils import FunctionMetadataCatalogEntry -class UdfMetadataCatalog(BaseModel): +class FunctionMetadataCatalog(BaseModel): """ - The `UdfMetadataCatalog` catalog stores information about the metadata of user-defined functions (UDFs). - Metadata is implemented a key-value pair that can be used to store additional information about the UDF. + The `FunctionMetadataCatalog` catalog stores information about the metadata of user-defined functions (Functions). + Metadata is implemented a key-value pair that can be used to store additional information about the Function. 
It maintains the following information for each attribute: `_row_id:` an autogenerated identifier `_key: ` key/identifier of the metadata (as a string) `_value:` value of the metadata (as a string) - `_udf_id:` the `_row_id` of the `UdfCatalog` entry to which the attribute belongs + `_function_id:` the `_row_id` of the `FunctionCatalog` entry to which the attribute belongs """ - __tablename__ = "udf_metadata_catalog" + __tablename__ = "function_metadata_catalog" _key = Column("key", String(100)) _value = Column("value", String(100)) - _udf_id = Column("udf_id", Integer, ForeignKey("udf_catalog._row_id")) + _function_id = Column( + "function_id", Integer, ForeignKey("function_catalog._row_id") + ) - __table_args__ = (UniqueConstraint("key", "udf_id"), {}) + __table_args__ = (UniqueConstraint("key", "function_id"), {}) - # Foreign key dependency with the udf catalog - _udf = relationship("UdfCatalog", back_populates="_metadata") + # Foreign key dependency with the function catalog + _function = relationship("FunctionCatalog", back_populates="_metadata") - def __init__(self, key: str, value: str, udf_id: int): + def __init__(self, key: str, value: str, function_id: int): self._key = key self._value = value - self._udf_id = udf_id + self._function_id = function_id - def as_dataclass(self) -> "UdfMetadataCatalogEntry": - return UdfMetadataCatalogEntry( + def as_dataclass(self) -> "FunctionMetadataCatalogEntry": + return FunctionMetadataCatalogEntry( row_id=self._row_id, key=self._key, value=self._value, - udf_id=self._udf_id, - udf_name=self._udf._name, + function_id=self._function_id, + function_name=self._function._name, ) diff --git a/evadb/catalog/models/index_catalog.py b/evadb/catalog/models/index_catalog.py index 699e8a317d..f4a9ff00b2 100644 --- a/evadb/catalog/models/index_catalog.py +++ b/evadb/catalog/models/index_catalog.py @@ -29,8 +29,8 @@ class IndexCatalog(BaseModel): `_save_file_path:` the path to the index file on disk `_type:` the type of the index 
(refer to `VectorStoreType`) `_feat_column_id:` the `_row_id` of the `ColumnCatalog` entry for the column on which the index is built. - `_udf_signature:` if the index is created by running udf expression on input column, this will store - the udf signature of the used udf. Otherwise, this field is None. + `_function_signature:` if the index is created by running function expression on input column, this will store + the function signature of the used function. Otherwise, this field is None. """ __tablename__ = "index_catalog" @@ -41,7 +41,7 @@ class IndexCatalog(BaseModel): _feat_column_id = Column( "column_id", Integer, ForeignKey("column_catalog._row_id", ondelete="CASCADE") ) - _udf_signature = Column("udf", String, default=None) + _function_signature = Column("function", String, default=None) _feat_column = relationship( "ColumnCatalog", @@ -54,13 +54,13 @@ def __init__( save_file_path: str, type: VectorStoreType, feat_column_id: int = None, - udf_signature: str = None, + function_signature: str = None, ): self._name = name self._save_file_path = save_file_path self._type = type self._feat_column_id = feat_column_id - self._udf_signature = udf_signature + self._function_signature = function_signature def as_dataclass(self) -> "IndexCatalogEntry": feat_column = self._feat_column.as_dataclass() if self._feat_column else None @@ -70,6 +70,6 @@ def as_dataclass(self) -> "IndexCatalogEntry": save_file_path=self._save_file_path, type=self._type, feat_column_id=self._feat_column_id, - udf_signature=self._udf_signature, + function_signature=self._function_signature, feat_column=feat_column, ) diff --git a/evadb/catalog/models/udf_cost_catalog.py b/evadb/catalog/models/udf_cost_catalog.py deleted file mode 100644 index f57a81dcef..0000000000 --- a/evadb/catalog/models/udf_cost_catalog.py +++ /dev/null @@ -1,51 +0,0 @@ -# coding=utf-8 -# Copyright 2018-2023 EvaDB -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in 
compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from sqlalchemy import Column, Float, ForeignKey, Integer, String - -from evadb.catalog.models.base_model import BaseModel -from evadb.catalog.models.utils import UdfCostCatalogEntry - - -class UdfCostCatalog(BaseModel): - """The `UdfCostCatalog` catalog stores information about the runtime of user-defined functions (UDFs) in the system. It maintains the following information for each UDF. - `_row_id:` an autogenerated unique identifier. - `_name:` name of the UDF - `_udf_id`: the row_id of the UDF - `_cost:` cost of this UDF - """ - - __tablename__ = "udf_cost_catalog" - - _udf_id = Column( - "udf_id", Integer, ForeignKey("udf_catalog._row_id", ondelete="CASCADE") - ) - _udf_name = Column( - "name", String(128), ForeignKey("udf_catalog.name", ondelete="CASCADE") - ) - _cost = Column("cost", Float) - - def __init__(self, udf_id: int, name: str, cost: float): - self._udf_id = udf_id - self._udf_name = name - self._cost = cost - - def as_dataclass(self) -> "UdfCostCatalogEntry": - return UdfCostCatalogEntry( - udf_id=self._udf_id, - name=self._udf_name, - cost=self._cost, - row_id=self._row_id, - ) diff --git a/evadb/catalog/models/utils.py b/evadb/catalog/models/utils.py index 2b92a8b4da..30773af12b 100644 --- a/evadb/catalog/models/utils.py +++ b/evadb/catalog/models/utils.py @@ -98,15 +98,15 @@ def drop_all_tables_except_catalog(engine: Engine): @dataclass(unsafe_hash=True) -class UdfCacheCatalogEntry: - """Dataclass representing an entry in the `UdfCatalog`.""" +class FunctionCacheCatalogEntry: + """Dataclass representing 
an entry in the `FunctionCatalog`.""" name: str - udf_id: int + function_id: int cache_path: str args: Tuple[str] row_id: int = None - udf_depends: Tuple[int] = field(compare=False, default_factory=tuple) + function_depends: Tuple[int] = field(compare=False, default_factory=tuple) col_depends: Tuple[int] = field(compare=False, default_factory=tuple) @@ -122,7 +122,9 @@ class ColumnCatalogEntry: table_id: int = None table_name: str = None row_id: int = None - dep_caches: List[UdfCacheCatalogEntry] = field(compare=False, default_factory=list) + dep_caches: List[FunctionCacheCatalogEntry] = field( + compare=False, default_factory=list + ) @dataclass(unsafe_hash=True) @@ -138,22 +140,22 @@ class TableCatalogEntry: @dataclass(unsafe_hash=True) -class UdfMetadataCatalogEntry: - """Class decouples the `UdfMetadataCatalog` from the sqlalchemy.""" +class FunctionMetadataCatalogEntry: + """Class decouples the `FunctionMetadataCatalog` from the sqlalchemy.""" key: str value: str - udf_id: int = None - udf_name: str = None + function_id: int = None + function_name: str = None row_id: int = None def display_format(self): - return f"{self.udf_name} - {self.key}: {self.value}" + return f"{self.function_name} - {self.key}: {self.value}" @dataclass(unsafe_hash=True) -class UdfIOCatalogEntry: - """Class decouples the `UdfIOCatalog` from the sqlalchemy.""" +class FunctionIOCatalogEntry: + """Class decouples the `FunctionIOCatalog` from the sqlalchemy.""" name: str type: ColumnType @@ -161,8 +163,8 @@ class UdfIOCatalogEntry: array_type: NdArrayType = None array_dimensions: Tuple[int] = None is_input: bool = True - udf_id: int = None - udf_name: str = None + function_id: int = None + function_name: str = None row_id: int = None def display_format(self): @@ -176,16 +178,16 @@ def display_format(self): @dataclass(unsafe_hash=True) -class UdfCostCatalogEntry: - """Dataclass representing an entry in the `UdfCostCatalog`.""" +class FunctionCostCatalogEntry: + """Dataclass representing an 
entry in the `FunctionCostCatalog`.""" name: str cost: float = None - udf_id: int = None + function_id: int = None row_id: int = None def display_format(self): - return {"udf_id": self.udf_id, "name": self.name, "cost": self.cost} + return {"function_id": self.function_id, "name": self.name, "cost": self.cost} @dataclass(unsafe_hash=True) @@ -197,13 +199,13 @@ class IndexCatalogEntry: type: VectorStoreType row_id: int = None feat_column_id: int = None - udf_signature: str = None + function_signature: str = None feat_column: ColumnCatalogEntry = None @dataclass(unsafe_hash=True) -class UdfCatalogEntry: - """Dataclass representing an entry in the `UdfCatalog`. +class FunctionCatalogEntry: + """Dataclass representing an entry in the `FunctionCatalog`. This is done to ensure we don't expose the sqlalchemy dependencies beyond catalog service. Further, sqlalchemy does not allow sharing of objects across threads. """ @@ -212,10 +214,14 @@ class UdfCatalogEntry: type: str checksum: str row_id: int = None - args: List[UdfIOCatalogEntry] = field(compare=False, default_factory=list) - outputs: List[UdfIOCatalogEntry] = field(compare=False, default_factory=list) - metadata: List[UdfMetadataCatalogEntry] = field(compare=False, default_factory=list) - dep_caches: List[UdfIOCatalogEntry] = field(compare=False, default_factory=list) + args: List[FunctionIOCatalogEntry] = field(compare=False, default_factory=list) + outputs: List[FunctionIOCatalogEntry] = field(compare=False, default_factory=list) + metadata: List[FunctionMetadataCatalogEntry] = field( + compare=False, default_factory=list + ) + dep_caches: List[FunctionIOCatalogEntry] = field( + compare=False, default_factory=list + ) def display_format(self): def _to_str(col): diff --git a/evadb/catalog/services/udf_cache_catalog_service.py b/evadb/catalog/services/function_cache_catalog_service.py similarity index 64% rename from evadb/catalog/services/udf_cache_catalog_service.py rename to 
evadb/catalog/services/function_cache_catalog_service.py index 8dc814cbff..6f6aec1d0f 100644 --- a/evadb/catalog/services/udf_cache_catalog_service.py +++ b/evadb/catalog/services/function_cache_catalog_service.py @@ -16,44 +16,48 @@ from sqlalchemy.orm.exc import NoResultFound from sqlalchemy.sql.expression import select -from evadb.catalog.models.udf_cache_catalog import UdfCacheCatalog -from evadb.catalog.models.utils import UdfCacheCatalogEntry +from evadb.catalog.models.function_cache_catalog import FunctionCacheCatalog +from evadb.catalog.models.utils import FunctionCacheCatalogEntry from evadb.catalog.services.base_service import BaseService from evadb.catalog.services.column_catalog_service import ColumnCatalogService -from evadb.catalog.services.udf_catalog_service import UdfCatalogService +from evadb.catalog.services.function_catalog_service import FunctionCatalogService from evadb.utils.errors import CatalogError from evadb.utils.logging_manager import logger -class UdfCacheCatalogService(BaseService): +class FunctionCacheCatalogService(BaseService): def __init__(self, db_session: Session): - super().__init__(UdfCacheCatalog, db_session) + super().__init__(FunctionCacheCatalog, db_session) self._column_service: ColumnCatalogService = ColumnCatalogService(db_session) - self._udf_service: UdfCatalogService = UdfCatalogService(db_session) + self._function_service: FunctionCatalogService = FunctionCatalogService( + db_session + ) - def insert_entry(self, entry: UdfCacheCatalogEntry) -> UdfCacheCatalogEntry: - """Insert a new udf cache entry into udf cache catalog. + def insert_entry( + self, entry: FunctionCacheCatalogEntry + ) -> FunctionCacheCatalogEntry: + """Insert a new function cache entry into function cache catalog. 
Arguments: `name` (str): name of the cache table - `udf_id` (int): `row_id` of the UDF on which the cache is built + `function_id` (int): `row_id` of the function on which the cache is built `cache_path` (str): path of the cache table - `args` (List[Any]): arguments of the UDF whose output is being cached - `udf_depends` (List[UdfCatalogEntry]): dependent UDF entries + `args` (List[Any]): arguments of the function whose output is being cached + `function_depends` (List[FunctionCatalogEntry]): dependent function entries `col_depends` (List[ColumnCatalogEntry]): dependent column entries Returns: - `UdfCacheCatalogEntry` + `FunctionCacheCatalogEntry` """ try: cache_obj = self.model( name=entry.name, - udf_id=entry.udf_id, + function_id=entry.function_id, cache_path=entry.cache_path, args=entry.args, ) - cache_obj._udf_depends = [ - self._udf_service.get_entry_by_id(udf_id, return_alchemy=True) - for udf_id in entry.udf_depends + cache_obj._function_depends = [ + self._function_service.get_entry_by_id(function_id, return_alchemy=True) + for function_id in entry.function_depends ] cache_obj._col_depends = [ self._column_service.get_entry_by_id(col_id, return_alchemy=True) @@ -62,15 +66,13 @@ def insert_entry(self, entry: UdfCacheCatalogEntry) -> UdfCacheCatalogEntry: cache_obj = cache_obj.save(self.session) except Exception as e: - err_msg = ( - f"Failed to insert entry into udf cache catalog with exception {str(e)}" - ) + err_msg = f"Failed to insert entry into function cache catalog with exception {str(e)}" logger.exception(err_msg) raise CatalogError(err_msg) else: return cache_obj.as_dataclass() - def get_entry_by_name(self, name: str) -> UdfCacheCatalogEntry: + def get_entry_by_name(self, name: str) -> FunctionCacheCatalogEntry: try: entry = self.session.execute( select(self.model).filter(self.model._name == name) @@ -79,10 +81,10 @@ def get_entry_by_name(self, name: str) -> UdfCacheCatalogEntry: except NoResultFound: return None - def delete_entry(self, cache: 
UdfCacheCatalogEntry): + def delete_entry(self, cache: FunctionCacheCatalogEntry): """Delete cache table from the db Arguments: - cache (UdfCacheCatalogEntry): cache to delete + cache (FunctionCacheCatalogEntry): cache to delete Returns: True if successfully removed else false """ diff --git a/evadb/catalog/services/udf_catalog_service.py b/evadb/catalog/services/function_catalog_service.py similarity index 52% rename from evadb/catalog/services/udf_catalog_service.py rename to evadb/catalog/services/function_catalog_service.py index 77310ce5fb..d8c449d1c9 100644 --- a/evadb/catalog/services/udf_catalog_service.py +++ b/evadb/catalog/services/function_catalog_service.py @@ -15,78 +15,80 @@ from sqlalchemy.orm import Session from sqlalchemy.sql.expression import select -from evadb.catalog.models.udf_catalog import UdfCatalog, UdfCatalogEntry +from evadb.catalog.models.function_catalog import FunctionCatalog, FunctionCatalogEntry from evadb.catalog.services.base_service import BaseService from evadb.utils.logging_manager import logger -class UdfCatalogService(BaseService): +class FunctionCatalogService(BaseService): def __init__(self, db_session: Session): - super().__init__(UdfCatalog, db_session) + super().__init__(FunctionCatalog, db_session) def insert_entry( self, name: str, impl_path: str, type: str, checksum: str - ) -> UdfCatalogEntry: - """Insert a new udf entry + ) -> FunctionCatalogEntry: + """Insert a new function entry Arguments: - name (str): name of the udf - impl_path (str): path to the udf implementation relative to evadb/udf - type (str): udf operator kind, classification or detection or etc - checksum(str): checksum of the udf file content, used for consistency + name (str): name of the function + impl_path (str): path to the function implementation relative to evadb/function + type (str): function operator kind, classification or detection or etc + checksum(str): checksum of the function file content, used for consistency Returns: - 
UdfCatalogEntry: Returns the new entry created + FunctionCatalogEntry: Returns the new entry created """ - udf_obj = self.model(name, impl_path, type, checksum) - udf_obj = udf_obj.save(self.session) - return udf_obj.as_dataclass() + function_obj = self.model(name, impl_path, type, checksum) + function_obj = function_obj.save(self.session) + return function_obj.as_dataclass() - def get_entry_by_name(self, name: str) -> UdfCatalogEntry: - """return the udf entry that matches the name provided. + def get_entry_by_name(self, name: str) -> FunctionCatalogEntry: + """return the function entry that matches the name provided. None if no such entry found. Arguments: name (str): name to be searched """ - udf_obj = self.session.execute( + function_obj = self.session.execute( select(self.model).filter(self.model._name == name) ).scalar_one_or_none() - if udf_obj: - return udf_obj.as_dataclass() + if function_obj: + return function_obj.as_dataclass() return None - def get_entry_by_id(self, id: int, return_alchemy=False) -> UdfCatalogEntry: - """return the udf entry that matches the id provided. + def get_entry_by_id(self, id: int, return_alchemy=False) -> FunctionCatalogEntry: + """return the function entry that matches the id provided. None if no such entry found. 
Arguments: id (int): id to be searched """ - udf_obj = self.session.execute( + function_obj = self.session.execute( select(self.model).filter(self.model._row_id == id) ).scalar_one_or_none() - if udf_obj: - return udf_obj if return_alchemy else udf_obj.as_dataclass() - return udf_obj + if function_obj: + return function_obj if return_alchemy else function_obj.as_dataclass() + return function_obj def delete_entry_by_name(self, name: str): - """Delete a udf entry from the catalog UdfCatalog + """Delete a function entry from the catalog FunctionCatalog Arguments: - name (str): udf name to be deleted + name (str): function name to be deleted Returns: True if successfully deleted else True """ try: - udf_obj = self.session.execute( + function_obj = self.session.execute( select(self.model).filter(self.model._name == name) ).scalar_one() - udf_obj.delete(self.session) + function_obj.delete(self.session) except Exception as e: - logger.exception(f"Delete udf failed for name {name} with error {str(e)}") + logger.exception( + f"Delete function failed for name {name} with error {str(e)}" + ) return False return True diff --git a/evadb/catalog/services/function_cost_catalog_service.py b/evadb/catalog/services/function_cost_catalog_service.py new file mode 100644 index 0000000000..ac84e8c948 --- /dev/null +++ b/evadb/catalog/services/function_cost_catalog_service.py @@ -0,0 +1,91 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from sqlalchemy.orm import Session +from sqlalchemy.sql.expression import select + +from evadb.catalog.models.function_cost_catalog import ( + FunctionCostCatalog, + FunctionCostCatalogEntry, +) +from evadb.catalog.services.base_service import BaseService +from evadb.utils.errors import CatalogError + + +class FunctionCostCatalogService(BaseService): + def __init__(self, db_session: Session): + super().__init__(FunctionCostCatalog, db_session) + + def insert_entry( + self, function_id: int, name: str, cost: int + ) -> FunctionCostCatalogEntry: + """Insert a new function cost entry + + Arguments: + function_id(int): id of the function + name (str) : name of the function + cost(int) : cost of the function + + Returns: + FunctionCostCatalogEntry: Returns the new entry created + """ + try: + function_obj = self.model(function_id, name, cost) + function_obj.save(self.session) + except Exception as e: + raise CatalogError( + f"Error while inserting entry to FunctionCostCatalog: {str(e)}" + ) + + def upsert_entry(self, function_id: int, name: str, new_cost: int): + """Upserts a new function cost entry + + Arguments: + function_id(int): id of the function + name (str) : name of the function + cost(int) : cost of the function + """ + try: + function_obj = self.session.execute( + select(self.model).filter(self.model._function_id == function_id) + ).scalar_one_or_none() + if function_obj: + function_obj.update(self.session, cost=new_cost) + else: + self.insert_entry(function_id, name, new_cost) + except Exception as e: + raise CatalogError( + f"Error while upserting entry to FunctionCostCatalog: {str(e)}" + ) + + def get_entry_by_name(self, name: str) -> FunctionCostCatalogEntry: + """return the function cost entry that matches the name provided. + None if no such entry found. 
+ + Arguments: + name (str): name to be searched + """ + + try: + function_obj = self.session.execute( + select(self.model).filter(self.model._function_name == name) + ).scalar_one_or_none() + if function_obj: + return function_obj.as_dataclass() + return None + except Exception as e: + raise CatalogError( + f"Error while getting entry for function {name} from FunctionCostCatalog: {str(e)}" + ) diff --git a/evadb/catalog/services/udf_io_catalog_service.py b/evadb/catalog/services/function_io_catalog_service.py similarity index 67% rename from evadb/catalog/services/udf_io_catalog_service.py rename to evadb/catalog/services/function_io_catalog_service.py index 60d51d898a..290f3d20e6 100644 --- a/evadb/catalog/services/udf_io_catalog_service.py +++ b/evadb/catalog/services/function_io_catalog_service.py @@ -17,21 +17,26 @@ from sqlalchemy.orm import Session from sqlalchemy.sql.expression import select -from evadb.catalog.models.udf_io_catalog import UdfIOCatalog, UdfIOCatalogEntry +from evadb.catalog.models.function_io_catalog import ( + FunctionIOCatalog, + FunctionIOCatalogEntry, +) from evadb.catalog.services.base_service import BaseService from evadb.utils.logging_manager import logger -class UdfIOCatalogService(BaseService): +class FunctionIOCatalogService(BaseService): def __init__(self, db_session: Session): - super().__init__(UdfIOCatalog, db_session) + super().__init__(FunctionIOCatalog, db_session) - def get_input_entries_by_udf_id(self, udf_id: int) -> List[UdfIOCatalogEntry]: + def get_input_entries_by_function_id( + self, function_id: int + ) -> List[FunctionIOCatalogEntry]: try: result = ( self.session.execute( select(self.model).filter( - self.model._udf_id == udf_id, + self.model._function_id == function_id, self.model._is_input == True, # noqa ) ) @@ -40,16 +45,18 @@ def get_input_entries_by_udf_id(self, udf_id: int) -> List[UdfIOCatalogEntry]: ) return [obj.as_dataclass() for obj in result] except Exception as e: - error = f"Getting inputs for UDF 
id {udf_id} raised {e}" + error = f"Getting inputs for function id {function_id} raised {e}" logger.error(error) raise RuntimeError(error) - def get_output_entries_by_udf_id(self, udf_id: int) -> List[UdfIOCatalogEntry]: + def get_output_entries_by_function_id( + self, function_id: int + ) -> List[FunctionIOCatalogEntry]: try: result = ( self.session.execute( select(self.model).filter( - self.model._udf_id == udf_id, + self.model._function_id == function_id, self.model._is_input == False, # noqa ) ) @@ -58,25 +65,25 @@ def get_output_entries_by_udf_id(self, udf_id: int) -> List[UdfIOCatalogEntry]: ) return [obj.as_dataclass() for obj in result] except Exception as e: - error = f"Getting outputs for UDF id {udf_id} raised {e}" + error = f"Getting outputs for function id {function_id} raised {e}" logger.error(error) raise RuntimeError(error) - def insert_entries(self, io_list: List[UdfIOCatalogEntry]): - """Commit entries to the udf_io table + def insert_entries(self, io_list: List[FunctionIOCatalogEntry]): + """Commit entries to the function_io table Arguments: - io_list (List[UdfIOCatalogEntry]): List of io info io be added + io_list (List[FunctionIOCatalogEntry]): List of io info io be added """ for io in io_list: - io_obj = UdfIOCatalog( + io_obj = FunctionIOCatalog( name=io.name, type=io.type, is_nullable=io.is_nullable, array_type=io.array_type, array_dimensions=io.array_dimensions, is_input=io.is_input, - udf_id=io.udf_id, + function_id=io.function_id, ) io_obj.save(self.session) diff --git a/evadb/catalog/services/udf_metadata_catalog_service.py b/evadb/catalog/services/function_metadata_catalog_service.py similarity index 64% rename from evadb/catalog/services/udf_metadata_catalog_service.py rename to evadb/catalog/services/function_metadata_catalog_service.py index 26a501d71e..e302ea41ef 100644 --- a/evadb/catalog/services/udf_metadata_catalog_service.py +++ b/evadb/catalog/services/function_metadata_catalog_service.py @@ -17,38 +17,40 @@ from 
sqlalchemy.orm import Session from sqlalchemy.sql.expression import select -from evadb.catalog.models.udf_metadata_catalog import ( - UdfMetadataCatalog, - UdfMetadataCatalogEntry, +from evadb.catalog.models.function_metadata_catalog import ( + FunctionMetadataCatalog, + FunctionMetadataCatalogEntry, ) from evadb.catalog.services.base_service import BaseService from evadb.utils.errors import CatalogError from evadb.utils.logging_manager import logger -class UdfMetadataCatalogService(BaseService): +class FunctionMetadataCatalogService(BaseService): def __init__(self, db_session: Session): - super().__init__(UdfMetadataCatalog, db_session) + super().__init__(FunctionMetadataCatalog, db_session) - def insert_entries(self, entries: List[UdfMetadataCatalogEntry]): + def insert_entries(self, entries: List[FunctionMetadataCatalogEntry]): try: for entry in entries: - metadata_obj = UdfMetadataCatalog( - key=entry.key, value=entry.value, udf_id=entry.udf_id + metadata_obj = FunctionMetadataCatalog( + key=entry.key, value=entry.value, function_id=entry.function_id ) metadata_obj.save(self.session) except Exception as e: logger.exception( - f"Failed to insert entry {entry} into udf metadata catalog with exception {str(e)}" + f"Failed to insert entry {entry} into function metadata catalog with exception {str(e)}" ) raise CatalogError(e) - def get_entries_by_udf_id(self, udf_id: int) -> List[UdfMetadataCatalogEntry]: + def get_entries_by_function_id( + self, function_id: int + ) -> List[FunctionMetadataCatalogEntry]: try: result = ( self.session.execute( select(self.model).filter( - self.model._udf_id == udf_id, + self.model._function_id == function_id, ) ) .scalars() @@ -56,6 +58,6 @@ def get_entries_by_udf_id(self, udf_id: int) -> List[UdfMetadataCatalogEntry]: ) return [obj.as_dataclass() for obj in result] except Exception as e: - error = f"Getting metadata entries for UDF id {udf_id} raised {e}" + error = f"Getting metadata entries for function id {function_id} raised {e}" 
logger.error(error) raise CatalogError(error) diff --git a/evadb/catalog/services/index_catalog_service.py b/evadb/catalog/services/index_catalog_service.py index ea41bd7225..852bcd9f07 100644 --- a/evadb/catalog/services/index_catalog_service.py +++ b/evadb/catalog/services/index_catalog_service.py @@ -34,10 +34,10 @@ def insert_entry( save_file_path: str, type: VectorStoreType, feat_column: ColumnCatalogEntry, - udf_signature: str, + function_signature: str, ) -> IndexCatalogEntry: index_entry = IndexCatalog( - name, save_file_path, type, feat_column.row_id, udf_signature + name, save_file_path, type, feat_column.row_id, function_signature ) index_entry = index_entry.save(self.session) return index_entry.as_dataclass() @@ -56,13 +56,13 @@ def get_entry_by_id(self, id: int) -> IndexCatalogEntry: except NoResultFound: return None - def get_entry_by_column_and_udf_signature( - self, column: ColumnCatalogEntry, udf_signature: str + def get_entry_by_column_and_function_signature( + self, column: ColumnCatalogEntry, function_signature: str ): try: entry = self.query.filter( self.model._feat_column_id == column.row_id, - self.model._udf_signature == udf_signature, + self.model._function_signature == function_signature, ).one() return entry.as_dataclass() except NoResultFound: diff --git a/evadb/catalog/services/udf_cost_catalog_service.py b/evadb/catalog/services/udf_cost_catalog_service.py deleted file mode 100644 index 284a2b4efa..0000000000 --- a/evadb/catalog/services/udf_cost_catalog_service.py +++ /dev/null @@ -1,86 +0,0 @@ -# coding=utf-8 -# Copyright 2018-2023 EvaDB -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from sqlalchemy.orm import Session -from sqlalchemy.sql.expression import select - -from evadb.catalog.models.udf_cost_catalog import UdfCostCatalog, UdfCostCatalogEntry -from evadb.catalog.services.base_service import BaseService -from evadb.utils.errors import CatalogError - - -class UdfCostCatalogService(BaseService): - def __init__(self, db_session: Session): - super().__init__(UdfCostCatalog, db_session) - - def insert_entry(self, udf_id: int, name: str, cost: int) -> UdfCostCatalogEntry: - """Insert a new udf cost entry - - Arguments: - udf_id(int): id of the udf - name (str) : name of the udf - cost(int) : cost of the udf - - Returns: - UdfCostCatalogEntry: Returns the new entry created - """ - try: - udf_obj = self.model(udf_id, name, cost) - udf_obj.save(self.session) - except Exception as e: - raise CatalogError( - f"Error while inserting entry to UdfCostCatalog: {str(e)}" - ) - - def upsert_entry(self, udf_id: int, name: str, new_cost: int): - """Upserts a new udf cost entry - - Arguments: - udf_id(int): id of the udf - name (str) : name of the udf - cost(int) : cost of the udf - """ - try: - udf_obj = self.session.execute( - select(self.model).filter(self.model._udf_id == udf_id) - ).scalar_one_or_none() - if udf_obj: - udf_obj.update(self.session, cost=new_cost) - else: - self.insert_entry(udf_id, name, new_cost) - except Exception as e: - raise CatalogError( - f"Error while upserting entry to UdfCostCatalog: {str(e)}" - ) - - def get_entry_by_name(self, name: str) -> UdfCostCatalogEntry: - """return the udf cost entry that matches the name 
provided. - None if no such entry found. - - Arguments: - name (str): name to be searched - """ - - try: - udf_obj = self.session.execute( - select(self.model).filter(self.model._udf_name == name) - ).scalar_one_or_none() - if udf_obj: - return udf_obj.as_dataclass() - return None - except Exception as e: - raise CatalogError( - f"Error while getting entry for udf {name} from UdfCostCatalog: {str(e)}" - ) diff --git a/evadb/catalog/sql_config.py b/evadb/catalog/sql_config.py index 616cee2436..efba395b8f 100644 --- a/evadb/catalog/sql_config.py +++ b/evadb/catalog/sql_config.py @@ -27,14 +27,14 @@ "column_catalog", "table_catalog", "database_catalog", - "depend_column_and_udf_cache", - "udf_cache", - "udf_catalog", - "depend_udf_and_udf_cache", + "depend_column_and_function_cache", + "function_cache", + "function_catalog", + "depend_function_and_function_cache", "index_catalog", - "udfio_catalog", - "udf_cost_catalog", - "udf_metadata_catalog", + "functionio_catalog", + "function_cost_catalog", + "function_metadata_catalog", ] diff --git a/evadb/configuration/bootstrap_environment.py b/evadb/configuration/bootstrap_environment.py index 4014f0f899..55e2f0f228 100644 --- a/evadb/configuration/bootstrap_environment.py +++ b/evadb/configuration/bootstrap_environment.py @@ -22,11 +22,11 @@ from evadb.configuration.constants import ( CACHE_DIR, DB_DEFAULT_NAME, + FUNCTION_DIR, INDEX_DIR, MODEL_DIR, S3_DOWNLOAD_DIR, TMP_DIR, - UDF_DIR, EvaDB_CONFIG_FILE, EvaDB_DATASET_DIR, ) @@ -103,7 +103,7 @@ def create_directories_and_get_default_config_values( cache_dir = evadb_dir / CACHE_DIR s3_dir = evadb_dir / S3_DOWNLOAD_DIR tmp_dir = evadb_dir / TMP_DIR - udf_dir = evadb_dir / UDF_DIR + function_dir = evadb_dir / FUNCTION_DIR model_dir = evadb_dir / MODEL_DIR if not evadb_dir.exists(): @@ -118,8 +118,8 @@ def create_directories_and_get_default_config_values( s3_dir.mkdir(parents=True, exist_ok=True) if not tmp_dir.exists(): tmp_dir.mkdir(parents=True, exist_ok=True) - if not 
udf_dir.exists(): - udf_dir.mkdir(parents=True, exist_ok=True) + if not function_dir.exists(): + function_dir.mkdir(parents=True, exist_ok=True) if not model_dir.exists(): model_dir.mkdir(parents=True, exist_ok=True) @@ -133,7 +133,7 @@ def create_directories_and_get_default_config_values( config_obj["storage"]["cache_dir"] = str(cache_dir.resolve()) config_obj["storage"]["s3_download_dir"] = str(s3_dir.resolve()) config_obj["storage"]["tmp_dir"] = str(tmp_dir.resolve()) - config_obj["storage"]["udf_dir"] = str(udf_dir.resolve()) + config_obj["storage"]["function_dir"] = str(function_dir.resolve()) config_obj["storage"]["model_dir"] = str(model_dir.resolve()) if category and key: return config_obj.get(category, {}).get(key, None) diff --git a/evadb/configuration/constants.py b/evadb/configuration/constants.py index 7e5659283b..51513462d6 100644 --- a/evadb/configuration/constants.py +++ b/evadb/configuration/constants.py @@ -22,7 +22,7 @@ EvaDB_APPS_DIR = "apps" EvaDB_DATASET_DIR = "evadb_datasets" EvaDB_CONFIG_FILE = "evadb.yml" -UDF_DIR = "udfs" +FUNCTION_DIR = "functions" MODEL_DIR = "models" CATALOG_DIR = "catalog" INDEX_DIR = "index" diff --git a/evadb/constants.py b/evadb/constants.py index 7c8a0a41c4..80777c5ab4 100644 --- a/evadb/constants.py +++ b/evadb/constants.py @@ -16,8 +16,8 @@ CONTINUOUS = 1 NO_GPU = -1 UNDEFINED_GROUP_ID = -1 -# remove this when we implement the cacheable logic in the UDF itself -CACHEABLE_UDFS = ["Yolo", "FaceDetector", "OCRExtractor", "HFObjectDetector"] +# remove this when we implement the cacheable logic in the function itself +CACHEABLE_FUNCTIONS = ["Yolo", "FaceDetector", "OCRExtractor", "HFObjectDetector"] IFRAMES = "IFRAMES" AUDIORATE = "AUDIORATE" DEFAULT_FUNCTION_EXPRESSION_COST = 100 diff --git a/evadb/executor/apply_and_merge_executor.py b/evadb/executor/apply_and_merge_executor.py index a9d0131393..2fca7fd14e 100644 --- a/evadb/executor/apply_and_merge_executor.py +++ b/evadb/executor/apply_and_merge_executor.py @@ 
-43,10 +43,12 @@ def exec(self, *args, **kwargs) -> Iterator[Batch]: func_result = self.func_expr.evaluate(batch) # persist stats of function expression - if self.func_expr.udf_obj and self.func_expr._stats: - udf_id = self.func_expr.udf_obj.row_id - self.catalog().upsert_udf_cost_catalog_entry( - udf_id, self.func_expr.udf_obj.name, self.func_expr._stats.prev_cost + if self.func_expr.function_obj and self.func_expr._stats: + function_id = self.func_expr.function_obj.row_id + self.catalog().upsert_function_cost_catalog_entry( + function_id, + self.func_expr.function_obj.name, + self.func_expr._stats.prev_cost, ) output = Batch.merge_column_wise([batch, func_result]) diff --git a/evadb/executor/create_database_executor.py b/evadb/executor/create_database_executor.py index a0b5aec751..bd1f0efd7b 100644 --- a/evadb/executor/create_database_executor.py +++ b/evadb/executor/create_database_executor.py @@ -16,8 +16,10 @@ from evadb.database import EvaDBDatabase from evadb.executor.abstract_executor import AbstractExecutor +from evadb.executor.executor_utils import ExecutorError from evadb.models.storage.batch import Batch from evadb.parser.create_statement import CreateDatabaseStatement +from evadb.third_party.databases.interface import get_database_handler from evadb.utils.logging_manager import logger @@ -26,15 +28,26 @@ def __init__(self, db: EvaDBDatabase, node: CreateDatabaseStatement): super().__init__(db, node) def exec(self, *args, **kwargs): - # todo handle if_not_exists + # TODO: handle if_not_exists logger.debug( f"Trying to connect to the provided engine {self.node.engine} with params {self.node.param_dict}" ) - # todo handle if the provided database params are valid - logger.debug(f"Creating database {self.node}") + # Check if database already exists. 
+ db_catalog_entry = self.catalog().get_database_catalog_entry( + self.node.database_name + ) + if db_catalog_entry is not None: + raise ExecutorError(f"{self.node.database_name} already exists.") + # Check the validity of database entry. + handler = get_database_handler(self.node.engine, **self.node.param_dict) + resp = handler.connect() + if not resp.status: + raise ExecutorError(f"Cannot establish connection due to {resp.error}") + + logger.debug(f"Creating database {self.node}") self.catalog().insert_database_catalog_entry( self.node.database_name, self.node.engine, self.node.param_dict ) diff --git a/evadb/executor/create_executor.py b/evadb/executor/create_executor.py index d16879f0a5..39541d971c 100644 --- a/evadb/executor/create_executor.py +++ b/evadb/executor/create_executor.py @@ -28,23 +28,30 @@ def exec(self, *args, **kwargs): if not handle_if_not_exists( self.catalog(), self.node.table_info, self.node.if_not_exists ): + create_table_done = False logger.debug(f"Creating table {self.node.table_info}") catalog_entry = self.catalog().create_and_insert_table_catalog_entry( self.node.table_info, self.node.column_list ) storage_engine = StorageEngine.factory(self.db, catalog_entry) - storage_engine.create(table=catalog_entry) + try: + storage_engine.create(table=catalog_entry) + create_table_done = True + if self.children != []: + assert ( + len(self.children) == 1 + ), "Create table from query expects 1 child, finds {}".format( + len(self.children) + ) + child = self.children[0] - if self.children != []: - assert ( - len(self.children) == 1 - ), "Create table from query expects 1 child, finds {}".format( - len(self.children) - ) - child = self.children[0] - - # Populate the table - for batch in child.exec(): - batch.drop_column_alias() - storage_engine.write(catalog_entry, batch) + # Populate the table + for batch in child.exec(): + batch.drop_column_alias() + storage_engine.write(catalog_entry, batch) + except Exception as e: + # rollback if the create 
call fails + if create_table_done: + storage_engine.drop(catalog_entry) + raise e diff --git a/evadb/executor/create_function_executor.py b/evadb/executor/create_function_executor.py new file mode 100644 index 0000000000..a7b0acd477 --- /dev/null +++ b/evadb/executor/create_function_executor.py @@ -0,0 +1,282 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from pathlib import Path +from typing import Dict, List + +import pandas as pd + +from evadb.catalog.catalog_utils import get_metadata_properties +from evadb.catalog.models.function_catalog import FunctionCatalogEntry +from evadb.catalog.models.function_io_catalog import FunctionIOCatalogEntry +from evadb.catalog.models.function_metadata_catalog import FunctionMetadataCatalogEntry +from evadb.configuration.constants import ( + DEFAULT_TRAIN_TIME_LIMIT, + EvaDB_INSTALLATION_DIR, +) +from evadb.database import EvaDBDatabase +from evadb.executor.abstract_executor import AbstractExecutor +from evadb.functions.decorators.utils import load_io_from_function_decorators +from evadb.models.storage.batch import Batch +from evadb.plan_nodes.create_function_plan import CreateFunctionPlan +from evadb.third_party.huggingface.create import gen_hf_io_catalog_entries +from evadb.utils.errors import FunctionIODefinitionError +from evadb.utils.generic_utils import ( + load_function_class_from_file, + try_to_import_ludwig, + try_to_import_torch, + try_to_import_ultralytics, +) 
+from evadb.utils.logging_manager import logger + + +class CreateFunctionExecutor(AbstractExecutor): + def __init__(self, db: EvaDBDatabase, node: CreateFunctionPlan): + super().__init__(db, node) + self.function_dir = Path(EvaDB_INSTALLATION_DIR) / "functions" + + def handle_huggingface_function(self): + """Handle HuggingFace Functions + + HuggingFace Functions are special Functions that are not loaded from a file. + So we do not need to call the setup method on them like we do for other Functions. + """ + # We need at least one deep learning framework for HuggingFace + # Torch or Tensorflow + try_to_import_torch() + impl_path = f"{self.function_dir}/abstract/hf_abstract_function.py" + io_list = gen_hf_io_catalog_entries(self.node.name, self.node.metadata) + return ( + self.node.name, + impl_path, + self.node.function_type, + io_list, + self.node.metadata, + ) + + def handle_ludwig_function(self): + """Handle ludwig Functions + + Use ludwig's auto_train engine to train/tune models. + """ + try_to_import_ludwig() + from ludwig.automl import auto_train + + assert ( + len(self.children) == 1 + ), "Create ludwig function expects 1 child, finds {}.".format( + len(self.children) + ) + + aggregated_batch_list = [] + child = self.children[0] + for batch in child.exec(): + aggregated_batch_list.append(batch) + aggregated_batch = Batch.concat(aggregated_batch_list, copy=False) + aggregated_batch.drop_column_alias() + + arg_map = {arg.key: arg.value for arg in self.node.metadata} + auto_train_results = auto_train( + dataset=aggregated_batch.frames, + target=arg_map["predict"], + tune_for_memory=arg_map.get("tune_for_memory", False), + time_limit_s=arg_map.get("time_limit", DEFAULT_TRAIN_TIME_LIMIT), + output_directory=self.db.config.get_value("storage", "tmp_dir"), + ) + model_path = os.path.join( + self.db.config.get_value("storage", "model_dir"), self.node.name + ) + auto_train_results.best_model.save(model_path) + self.node.metadata.append( + 
FunctionMetadataCatalogEntry("model_path", model_path) + ) + + impl_path = Path(f"{self.function_dir}/ludwig.py").absolute().as_posix() + io_list = self._resolve_function_io(None) + return ( + self.node.name, + impl_path, + self.node.function_type, + io_list, + self.node.metadata, + ) + + def handle_ultralytics_function(self): + """Handle Ultralytics Functions""" + try_to_import_ultralytics() + + impl_path = ( + Path(f"{self.function_dir}/yolo_object_detector.py").absolute().as_posix() + ) + function = self._try_initializing_function( + impl_path, function_args=get_metadata_properties(self.node) + ) + io_list = self._resolve_function_io(function) + return ( + self.node.name, + impl_path, + self.node.function_type, + io_list, + self.node.metadata, + ) + + def handle_generic_function(self): + """Handle generic Functions + + Generic Functions are loaded from a file. We check for inputs passed by the user during CREATE or try to load io from decorators. + """ + impl_path = self.node.impl_path.absolute().as_posix() + function = self._try_initializing_function(impl_path) + io_list = self._resolve_function_io(function) + + return ( + self.node.name, + impl_path, + self.node.function_type, + io_list, + self.node.metadata, + ) + + def exec(self, *args, **kwargs): + """Create function executor + + Calls the catalog to insert a function catalog entry. + """ + # check catalog if it already has this function entry + if self.catalog().get_function_catalog_entry_by_name(self.node.name): + if self.node.if_not_exists: + msg = f"Function {self.node.name} already exists, nothing added." + yield Batch(pd.DataFrame([msg])) + return + else: + msg = f"Function {self.node.name} already exists." 
+ logger.error(msg) + raise RuntimeError(msg) + + # if it's a type of HuggingFaceModel, override the impl_path + if self.node.function_type == "HuggingFace": + ( + name, + impl_path, + function_type, + io_list, + metadata, + ) = self.handle_huggingface_function() + elif self.node.function_type == "ultralytics": + ( + name, + impl_path, + function_type, + io_list, + metadata, + ) = self.handle_ultralytics_function() + elif self.node.function_type == "Ludwig": + ( + name, + impl_path, + function_type, + io_list, + metadata, + ) = self.handle_ludwig_function() + else: + ( + name, + impl_path, + function_type, + io_list, + metadata, + ) = self.handle_generic_function() + + self.catalog().insert_function_catalog_entry( + name, impl_path, function_type, io_list, metadata + ) + yield Batch( + pd.DataFrame( + [f"Function {self.node.name} successfully added to the database."] + ) + ) + + def _try_initializing_function( + self, impl_path: str, function_args: Dict = {} + ) -> FunctionCatalogEntry: + """Attempts to initialize function given the implementation file path and arguments. + + Args: + impl_path (str): The file path of the function implementation file. + function_args (Dict, optional): Dictionary of arguments to pass to the Function. Defaults to {}. + + Returns: + FunctionCatalogEntry: A FunctionCatalogEntry object that represents the initialized Function. + + Raises: + RuntimeError: If an error occurs while initializing the Function. 
+ """ + + # load the function class from the file + try: + # loading the function class from the file + function = load_function_class_from_file(impl_path, self.node.name) + # initializing the function class calls the setup method internally + function(**function_args) + except Exception as e: + err_msg = f"Error creating Function: {str(e)}" + # logger.error(err_msg) + raise RuntimeError(err_msg) + + return function + + def _resolve_function_io( + self, function: FunctionCatalogEntry + ) -> List[FunctionIOCatalogEntry]: + """Private method that resolves the input/output definitions for a given Function. + It first searches for the input/outputs in the CREATE statement. If not found, it resolves them using decorators. If not found there as well, it raises an error. + + Args: + function (FunctionCatalogEntry): The function for which to resolve input and output definitions. + + Returns: + A List of FunctionIOCatalogEntry objects that represent the resolved input and + output definitions for the Function. + + Raises: + RuntimeError: If an error occurs while resolving the function input/output + definitions. 
+ """ + io_list = [] + try: + if self.node.inputs: + io_list.extend(self.node.inputs) + else: + # try to load the inputs from decorators, the inputs from CREATE statement take precedence + io_list.extend( + load_io_from_function_decorators(function, is_input=True) + ) + + if self.node.outputs: + io_list.extend(self.node.outputs) + else: + # try to load the outputs from decorators, the outputs from CREATE statement take precedence + io_list.extend( + load_io_from_function_decorators(function, is_input=False) + ) + + except FunctionIODefinitionError as e: + err_msg = ( + f"Error creating Function, input/output definition incorrect: {str(e)}" + ) + logger.error(err_msg) + raise RuntimeError(err_msg) + + return io_list diff --git a/evadb/executor/create_index_executor.py b/evadb/executor/create_index_executor.py index c9fb87ce8c..0dee98b163 100644 --- a/evadb/executor/create_index_executor.py +++ b/evadb/executor/create_index_executor.py @@ -73,11 +73,11 @@ def _create_index(self): input_dim = -1 storage_engine = StorageEngine.factory(self.db, feat_catalog_entry) for input_batch in storage_engine.read(feat_catalog_entry): - if self.node.udf_func: - # Create index through UDF expression. - # UDF(input column) -> 2 dimension feature vector. + if self.node.function: + # Create index through function expression. + # Function(input column) -> 2 dimension feature vector. input_batch.modify_column_alias(feat_catalog_entry.name.lower()) - feat_batch = self.node.udf_func.evaluate(input_batch) + feat_batch = self.node.function.evaluate(input_batch) feat_batch.drop_column_alias() input_batch.drop_column_alias() feat = feat_batch.column_as_numpy_array("features") @@ -114,7 +114,7 @@ def _create_index(self): self.index_path, self.node.vector_store_type, feat_column, - self.node.udf_func.signature() if self.node.udf_func else None, + self.node.function.signature() if self.node.function else None, ) except Exception as e: # Delete index. 
diff --git a/evadb/executor/create_udf_executor.py b/evadb/executor/create_udf_executor.py index bee82ea4b7..7d68aab5f3 100644 --- a/evadb/executor/create_udf_executor.py +++ b/evadb/executor/create_udf_executor.py @@ -12,7 +12,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import hashlib import os +import pickle from pathlib import Path from typing import Dict, List @@ -35,6 +37,7 @@ from evadb.utils.errors import UDFIODefinitionError from evadb.utils.generic_utils import ( load_udf_class_from_file, + try_to_import_forecast, try_to_import_ludwig, try_to_import_torch, try_to_import_ultralytics, @@ -53,7 +56,7 @@ def handle_huggingface_udf(self): HuggingFace UDFs are special UDFs that are not loaded from a file. So we do not need to call the setup method on them like we do for other UDFs. """ - # We need atleast one deep learning framework for HuggingFace + # We need at least one deep learning framework for HuggingFace # Torch or Tensorflow try_to_import_torch() impl_path = f"{self.udf_dir}/abstract/hf_abstract_udf.py" @@ -69,7 +72,7 @@ def handle_huggingface_udf(self): def handle_ludwig_udf(self): """Handle ludwig UDFs - Use ludwig's auto_train engine to train/tune models. + Use Ludwig's auto_train engine to train/tune models. 
""" try_to_import_ludwig() from ludwig.automl import auto_train @@ -128,6 +131,119 @@ def handle_ultralytics_udf(self): self.node.metadata, ) + def handle_forecasting_udf(self): + """Handle forecasting UDFs""" + aggregated_batch_list = [] + child = self.children[0] + for batch in child.exec(): + aggregated_batch_list.append(batch) + aggregated_batch = Batch.concat(aggregated_batch_list, copy=False) + aggregated_batch.drop_column_alias() + + arg_map = {arg.key: arg.value for arg in self.node.metadata} + if not self.node.impl_path: + impl_path = Path(f"{self.udf_dir}/forecast.py").absolute().as_posix() + else: + impl_path = self.node.impl_path.absolute().as_posix() + arg_map = {arg.key: arg.value for arg in self.node.metadata} + + if "model" not in arg_map.keys(): + arg_map["model"] = "AutoARIMA" + if "frequency" not in arg_map.keys(): + arg_map["frequency"] = "M" + + model_name = arg_map["model"] + frequency = arg_map["frequency"] + + data = aggregated_batch.frames.rename(columns={arg_map["predict"]: "y"}) + if "time" in arg_map.keys(): + aggregated_batch.frames.rename(columns={arg_map["time"]: "ds"}) + if "id" in arg_map.keys(): + aggregated_batch.frames.rename(columns={arg_map["id"]: "unique_id"}) + + if "unique_id" not in list(data.columns): + data["unique_id"] = ["test" for x in range(len(data))] + + if "ds" not in list(data.columns): + data["ds"] = [x + 1 for x in range(len(data))] + + try_to_import_forecast() + from statsforecast import StatsForecast + from statsforecast.models import AutoARIMA, AutoCES, AutoETS, AutoTheta + + model_dict = { + "AutoARIMA": AutoARIMA, + "AutoCES": AutoCES, + "AutoETS": AutoETS, + "AutoTheta": AutoTheta, + } + + season_dict = { # https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases + "H": 24, + "M": 12, + "Q": 4, + "SM": 24, + "BM": 12, + "BMS": 12, + "BQ": 4, + "BH": 24, + } + + new_freq = ( + frequency.split("-")[0] if "-" in frequency else frequency + ) # shortens longer frequencies like Q-DEC + 
season_length = season_dict[new_freq] if new_freq in season_dict else 1 + model = StatsForecast( + [model_dict[model_name](season_length=season_length)], freq=new_freq + ) + + model_dir = os.path.join( + self.db.config.get_value("storage", "model_dir"), self.node.name + ) + Path(model_dir).mkdir(parents=True, exist_ok=True) + model_path = os.path.join( + self.db.config.get_value("storage", "model_dir"), + self.node.name, + str(hashlib.sha256(data.to_string().encode()).hexdigest()) + ".pkl", + ) + + weight_file = Path(model_path) + + if not weight_file.exists(): + model.fit(data) + f = open(model_path, "wb") + pickle.dump(model, f) + f.close() + + arg_map_here = {"model_name": model_name, "model_path": model_path} + udf = self._try_initializing_udf(impl_path, arg_map_here) + io_list = self._resolve_udf_io(udf) + + metadata_here = [ + UdfMetadataCatalogEntry( + key="model_name", + value=model_name, + udf_id=None, + udf_name=None, + row_id=None, + ), + UdfMetadataCatalogEntry( + key="model_path", + value=model_path, + udf_id=None, + udf_name=None, + row_id=None, + ), + ] + + return ( + self.node.name, + impl_path, + self.node.udf_type, + io_list, + metadata_here, + ) + def handle_generic_udf(self): """Handle generic UDFs @@ -168,6 +284,8 @@ def exec(self, *args, **kwargs): name, impl_path, udf_type, io_list, metadata = self.handle_ultralytics_udf() elif self.node.udf_type == "Ludwig": name, impl_path, udf_type, io_list, metadata = self.handle_ludwig_udf() + elif self.node.udf_type == "Forecasting": + name, impl_path, udf_type, io_list, metadata = self.handle_forecasting_udf() else: name, impl_path, udf_type, io_list, metadata = self.handle_generic_udf() diff --git a/evadb/executor/drop_object_executor.py b/evadb/executor/drop_object_executor.py index b3cee68371..7a56674fe6 100644 --- a/evadb/executor/drop_object_executor.py +++ b/evadb/executor/drop_object_executor.py @@ -40,8 +40,8 @@ def exec(self, *args, **kwargs): elif self.node.object_type == ObjectType.INDEX: 
yield self._handle_drop_index(self.node.name, self.node.if_exists) - elif self.node.object_type == ObjectType.UDF: - yield self._handle_drop_udf(self.node.name, self.node.if_exists) + elif self.node.object_type == ObjectType.FUNCTION: + yield self._handle_drop_function(self.node.name, self.node.if_exists) def _handle_drop_table(self, table_name: str, if_exists: bool): if not self.catalog().check_table_exists(table_name): @@ -59,7 +59,7 @@ def _handle_drop_table(self, table_name: str, if_exists: bool): for col_obj in table_obj.columns: for cache in col_obj.dep_caches: - self.catalog().drop_udf_cache_catalog_entry(cache) + self.catalog().drop_function_cache_catalog_entry(cache) # todo also delete the indexes associated with the table assert self.catalog().delete_table_catalog_entry( @@ -73,27 +73,31 @@ def _handle_drop_table(self, table_name: str, if_exists: bool): ) ) - def _handle_drop_udf(self, udf_name: str, if_exists: bool): - # check catalog if it already has this udf entry - if not self.catalog().get_udf_catalog_entry_by_name(udf_name): - err_msg = f"UDF {udf_name} does not exist, therefore cannot be dropped." + def _handle_drop_function(self, function_name: str, if_exists: bool): + # check catalog if it already has this function entry + if not self.catalog().get_function_catalog_entry_by_name(function_name): + err_msg = ( + f"Function {function_name} does not exist, therefore cannot be dropped." 
+ ) if if_exists: logger.warning(err_msg) return Batch(pd.DataFrame([err_msg])) else: raise RuntimeError(err_msg) else: - udf_entry = self.catalog().get_udf_catalog_entry_by_name(udf_name) - for cache in udf_entry.dep_caches: - self.catalog().drop_udf_cache_catalog_entry(cache) + function_entry = self.catalog().get_function_catalog_entry_by_name( + function_name + ) + for cache in function_entry.dep_caches: + self.catalog().drop_function_cache_catalog_entry(cache) # todo also delete the indexes associated with the table - self.catalog().delete_udf_catalog_entry_by_name(udf_name) + self.catalog().delete_function_catalog_entry_by_name(function_name) return Batch( pd.DataFrame( - {f"UDF {udf_name} successfully dropped"}, + {f"Function {function_name} successfully dropped"}, index=[0], ) ) diff --git a/evadb/executor/executor_utils.py b/evadb/executor/executor_utils.py index e2368f67f1..89cf1e4736 100644 --- a/evadb/executor/executor_utils.py +++ b/evadb/executor/executor_utils.py @@ -45,10 +45,12 @@ def apply_project( # persist stats of function expression for expr in project_list: for func_expr in expr.find_all(FunctionExpression): - if func_expr.udf_obj and func_expr._stats: - udf_id = func_expr.udf_obj.row_id - catalog.upsert_udf_cost_catalog_entry( - udf_id, func_expr.udf_obj.name, func_expr._stats.prev_cost + if func_expr.function_obj and func_expr._stats: + function_id = func_expr.function_obj.row_id + catalog.upsert_function_cost_catalog_entry( + function_id, + func_expr.function_obj.name, + func_expr._stats.prev_cost, ) return batch @@ -63,10 +65,10 @@ def apply_predicate( # persist stats of function expression for func_expr in predicate.find_all(FunctionExpression): - if func_expr.udf_obj and func_expr._stats: - udf_id = func_expr.udf_obj.row_id - catalog.upsert_udf_cost_catalog_entry( - udf_id, func_expr.udf_obj.name, func_expr._stats.prev_cost + if func_expr.function_obj and func_expr._stats: + function_id = func_expr.function_obj.row_id + 
catalog.upsert_function_cost_catalog_entry( + function_id, func_expr.function_obj.name, func_expr._stats.prev_cost ) return batch diff --git a/evadb/executor/function_scan_executor.py b/evadb/executor/function_scan_executor.py index 0f307b1b85..0c028e3881 100644 --- a/evadb/executor/function_scan_executor.py +++ b/evadb/executor/function_scan_executor.py @@ -42,10 +42,12 @@ def exec(self, *args, **kwargs) -> Iterator[Batch]: res = self.func_expr.evaluate(lateral_input) # persist stats of function expression - if self.func_expr.udf_obj and self.func_expr._stats: - udf_id = self.func_expr.udf_obj.row_id - self.catalog().upsert_udf_cost_catalog_entry( - udf_id, self.func_expr.udf_obj.name, self.func_expr._stats.prev_cost + if self.func_expr.function_obj and self.func_expr._stats: + function_id = self.func_expr.function_obj.row_id + self.catalog().upsert_function_cost_catalog_entry( + function_id, + self.func_expr.function_obj.name, + self.func_expr._stats.prev_cost, ) if not res.empty(): diff --git a/evadb/executor/plan_executor.py b/evadb/executor/plan_executor.py index c593d2002d..8c7a8b47b7 100644 --- a/evadb/executor/plan_executor.py +++ b/evadb/executor/plan_executor.py @@ -19,8 +19,8 @@ from evadb.executor.apply_and_merge_executor import ApplyAndMergeExecutor from evadb.executor.create_database_executor import CreateDatabaseExecutor from evadb.executor.create_executor import CreateExecutor +from evadb.executor.create_function_executor import CreateFunctionExecutor from evadb.executor.create_index_executor import CreateIndexExecutor -from evadb.executor.create_udf_executor import CreateUDFExecutor from evadb.executor.delete_executor import DeleteExecutor from evadb.executor.drop_object_executor import DropObjectExecutor from evadb.executor.exchange_executor import ExchangeExecutor @@ -110,8 +110,8 @@ def _build_execution_tree( executor_node = DropObjectExecutor(db=self._db, node=plan) elif plan_opr_type == PlanOprType.INSERT: executor_node = 
InsertExecutor(db=self._db, node=plan) - elif plan_opr_type == PlanOprType.CREATE_UDF: - executor_node = CreateUDFExecutor(db=self._db, node=plan) + elif plan_opr_type == PlanOprType.CREATE_FUNCTION: + executor_node = CreateFunctionExecutor(db=self._db, node=plan) elif plan_opr_type == PlanOprType.LOAD_DATA: executor_node = LoadDataExecutor(db=self._db, node=plan) elif plan_opr_type == PlanOprType.GROUP_BY: diff --git a/evadb/executor/show_info_executor.py b/evadb/executor/show_info_executor.py index 9d95281c42..1f1b798711 100644 --- a/evadb/executor/show_info_executor.py +++ b/evadb/executor/show_info_executor.py @@ -30,13 +30,13 @@ def exec(self, *args, **kwargs): show_entries = [] assert ( - self.node.show_type is ShowType.UDFS or ShowType.TABLES + self.node.show_type is ShowType.FUNCTIONS or ShowType.TABLES ), f"Show command does not support type {self.node.show_type}" - if self.node.show_type is ShowType.UDFS: - udfs = self.catalog().get_all_udf_catalog_entries() - for udf in udfs: - show_entries.append(udf.display_format()) + if self.node.show_type is ShowType.FUNCTIONS: + functions = self.catalog().get_all_function_catalog_entries() + for function in functions: + show_entries.append(function.display_format()) elif self.node.show_type is ShowType.TABLES: tables = self.catalog().get_all_table_catalog_entries() for table in tables: diff --git a/evadb/executor/use_executor.py b/evadb/executor/use_executor.py index f8f66df7ac..dd1a94448a 100644 --- a/evadb/executor/use_executor.py +++ b/evadb/executor/use_executor.py @@ -33,6 +33,11 @@ def exec(self, *args, **kwargs) -> Iterator[Batch]: self._database_name ) + if db_catalog_entry is None: + raise ExecutorError( + f"{self._database_name} data source does not exist. Use CREATE DATABASE to add a new data source." 
+ ) + handler = get_database_handler( db_catalog_entry.engine, **db_catalog_entry.params, diff --git a/evadb/expression/function_expression.py b/evadb/expression/function_expression.py index bf578d1c54..c9ebe69909 100644 --- a/evadb/expression/function_expression.py +++ b/evadb/expression/function_expression.py @@ -18,17 +18,17 @@ import numpy as np import pandas as pd -from evadb.catalog.models.udf_catalog import UdfCatalogEntry -from evadb.catalog.models.udf_io_catalog import UdfIOCatalogEntry +from evadb.catalog.models.function_catalog import FunctionCatalogEntry +from evadb.catalog.models.function_io_catalog import FunctionIOCatalogEntry from evadb.constants import NO_GPU from evadb.executor.execution_context import Context from evadb.expression.abstract_expression import AbstractExpression, ExpressionType +from evadb.functions.gpu_compatible import GPUCompatible from evadb.models.storage.batch import Batch from evadb.parser.alias import Alias -from evadb.udfs.gpu_compatible import GPUCompatible from evadb.utils.kv_cache import DiskKVCache from evadb.utils.logging_manager import logger -from evadb.utils.stats import UDFStats +from evadb.utils.stats import FunctionStats class FunctionExpression(AbstractExpression): @@ -66,11 +66,11 @@ def __init__( self._function_instance = None self._output = output self.alias = alias - self.udf_obj: UdfCatalogEntry = None - self.output_objs: List[UdfIOCatalogEntry] = [] + self.function_obj: FunctionCatalogEntry = None + self.output_objs: List[FunctionIOCatalogEntry] = [] self.projection_columns: List[str] = [] self._cache: FunctionExpressionCache = None - self._stats = UDFStats() + self._stats = FunctionStats() @property def name(self): @@ -104,7 +104,7 @@ def has_cache(self): return self._cache is not None def consolidate_stats(self): - if self.udf_obj is None: + if self.function_obj is None: return # if the function expression support cache only approximate using cache_miss entries. 
@@ -122,8 +122,8 @@ def consolidate_stats(self): def evaluate(self, batch: Batch, **kwargs) -> Batch: func = self._gpu_enabled_function() - # record the time taken for the udf execution - # note the udf might be using cache + # record the time taken for the function execution + # note the function might be using cache with self._stats.timer: # apply the function and project the required columns outcomes = self._apply_function_expression(func, batch, **kwargs) @@ -141,7 +141,7 @@ def evaluate(self, batch: Batch, **kwargs) -> Batch: self.consolidate_stats() except Exception as e: logger.warn( - f"Persisting Function Expression {str(self)} stats failed with {str(e)}" + f"Persisting FunctionExpression {str(self)} stats failed with {str(e)}" ) return outcomes @@ -149,7 +149,7 @@ def evaluate(self, batch: Batch, **kwargs) -> Batch: def signature(self) -> str: """It constructs the signature of the function expression. It traverses the children (function arguments) and compute signature for each - child. The output is in the form `udf_name[row_id](arg1, arg2, ...)`. + child. The output is in the form `function_name[row_id](arg1, arg2, ...)`. Returns: str: signature string @@ -158,7 +158,7 @@ def signature(self) -> str: for child in self.children: child_sigs.append(child.signature()) - func_sig = f"{self.name}[{self.udf_obj.row_id}]({','.join(child_sigs)})" + func_sig = f"{self.name}[{self.function_obj.row_id}]({','.join(child_sigs)})" return func_sig def _gpu_enabled_function(self): @@ -187,7 +187,7 @@ def _apply_function_expression(self, func: Callable, batch: Batch, **kwargs): if not self._cache: return func_args.apply_function_expression(func) - output_cols = [obj.name for obj in self.udf_obj.outputs] + output_cols = [obj.name for obj in self.function_obj.outputs] # 1. check cache # We are required to iterate over the batch row by row and check the cache. 
diff --git a/evadb/expression/tuple_value_expression.py b/evadb/expression/tuple_value_expression.py index 993508728b..b1f7b182cf 100644 --- a/evadb/expression/tuple_value_expression.py +++ b/evadb/expression/tuple_value_expression.py @@ -15,7 +15,7 @@ from typing import Union from evadb.catalog.models.column_catalog import ColumnCatalogEntry -from evadb.catalog.models.udf_io_catalog import UdfIOCatalogEntry +from evadb.catalog.models.function_io_catalog import FunctionIOCatalogEntry from evadb.models.storage.batch import Batch from .abstract_expression import ( @@ -30,7 +30,7 @@ def __init__( self, name: str = None, table_alias: str = None, - col_object: Union[ColumnCatalogEntry, UdfIOCatalogEntry] = None, + col_object: Union[ColumnCatalogEntry, FunctionIOCatalogEntry] = None, col_alias=None, ): super().__init__(ExpressionType.TUPLE_VALUE, rtype=ExpressionReturnType.INVALID) @@ -52,11 +52,11 @@ def name(self) -> str: return self._name @property - def col_object(self) -> Union[ColumnCatalogEntry, UdfIOCatalogEntry]: + def col_object(self) -> Union[ColumnCatalogEntry, FunctionIOCatalogEntry]: return self._col_object @col_object.setter - def col_object(self, value: Union[ColumnCatalogEntry, UdfIOCatalogEntry]): + def col_object(self, value: Union[ColumnCatalogEntry, FunctionIOCatalogEntry]): self._col_object = value @property @@ -74,22 +74,22 @@ def signature(self): """It constructs the signature of the tuple value expression. It assumes the col_object attribute is populated by the binder with the catalog entries. For a standard column in the table, it returns `table_name.col_name`, - and for UDF output columns it returns `udf_name.col_name` + and for function output columns it returns `function_name.col_name` Raises: - ValueError: If the col_object is not a `Union[ColumnCatalogEntry, UdfIOCatalogEntry]`. This can occur if the expression has not been bound using the binder. + ValueError: If the col_object is not a `Union[ColumnCatalogEntry, FunctionIOCatalogEntry]`. 
This can occur if the expression has not been bound using the binder. Returns: str: signature string """ assert isinstance(self.col_object, ColumnCatalogEntry) or isinstance( - self.col_object, UdfIOCatalogEntry - ), f"Unsupported type of self.col_object {type(self.col_object)}, expected ColumnCatalogEntry or UdfIOCatalogEntry" + self.col_object, FunctionIOCatalogEntry + ), f"Unsupported type of self.col_object {type(self.col_object)}, expected ColumnCatalogEntry or FunctionIOCatalogEntry" col_name = self.col_object.name row_id = self.col_object.row_id if isinstance(self.col_object, ColumnCatalogEntry): return f"{self.col_object.table_name}.{col_name}[{row_id}]" - elif isinstance(self.col_object, UdfIOCatalogEntry): - return f"{self.col_object.udf_name}.{col_name}[{row_id}]" + elif isinstance(self.col_object, FunctionIOCatalogEntry): + return f"{self.col_object.function_name}.{col_name}[{row_id}]" def __eq__(self, other): is_subtree_equal = super().__eq__(other) diff --git a/evadb/udfs/README.md b/evadb/functions/README.md similarity index 100% rename from evadb/udfs/README.md rename to evadb/functions/README.md diff --git a/evadb/udfs/__init__.py b/evadb/functions/__init__.py similarity index 100% rename from evadb/udfs/__init__.py rename to evadb/functions/__init__.py diff --git a/evadb/udfs/abstract/__init__.py b/evadb/functions/abstract/__init__.py similarity index 100% rename from evadb/udfs/abstract/__init__.py rename to evadb/functions/abstract/__init__.py diff --git a/evadb/udfs/abstract/abstract_udf.py b/evadb/functions/abstract/abstract_function.py similarity index 85% rename from evadb/udfs/abstract/abstract_udf.py rename to evadb/functions/abstract/abstract_function.py index c86c78652c..80061a81e3 100644 --- a/evadb/udfs/abstract/abstract_udf.py +++ b/evadb/functions/abstract/abstract_function.py @@ -21,9 +21,9 @@ InputType = Union[pd.DataFrame, ArrayLike] -class AbstractUDF(metaclass=ABCMeta): +class AbstractFunction(metaclass=ABCMeta): """ - Abstract 
class for UDFs. All the UDFs in EvaDB will inherit from this. + Abstract class for Functions. All the Functions in EvaDB will inherit from this. Load and initialize the machine learning model in the __init__. @@ -38,7 +38,7 @@ def __call__(self, *args, **kwargs): def __str__(self): return self.name - """Abstract Methods all UDFs must implement. """ + """Abstract Methods all Functions must implement. """ @abstractmethod def setup(self, *args, **kwargs) -> None: @@ -50,7 +50,7 @@ def setup(self, *args, **kwargs) -> None: @abstractmethod def forward(self, frames: InputType) -> InputType: """ - Implement UDF function call by overriding this function. + Implement function function call by overriding this function. Gets called automatically by __call__. """ pass @@ -61,7 +61,7 @@ def name(self) -> str: pass -class AbstractClassifierUDF(AbstractUDF): +class AbstractClassifierFunction(AbstractFunction): @property @abstractmethod def labels(self) -> List[str]: @@ -72,7 +72,7 @@ def labels(self) -> List[str]: pass -class AbstractTransformationUDF(AbstractUDF): +class AbstractTransformationFunction(AbstractFunction): @abstractmethod def transform(self, frames: ArrayLike) -> ArrayLike: """ diff --git a/evadb/udfs/abstract/hf_abstract_udf.py b/evadb/functions/abstract/hf_abstract_function.py similarity index 85% rename from evadb/udfs/abstract/hf_abstract_udf.py rename to evadb/functions/abstract/hf_abstract_function.py index e85c3b2786..3945ea0ddd 100644 --- a/evadb/udfs/abstract/hf_abstract_udf.py +++ b/evadb/functions/abstract/hf_abstract_function.py @@ -16,13 +16,13 @@ import pandas as pd -from evadb.catalog.models.udf_catalog import UdfCatalogEntry -from evadb.udfs.abstract.abstract_udf import AbstractUDF -from evadb.udfs.gpu_compatible import GPUCompatible +from evadb.catalog.models.function_catalog import FunctionCatalogEntry +from evadb.functions.abstract.abstract_function import AbstractFunction +from evadb.functions.gpu_compatible import GPUCompatible from 
evadb.utils.generic_utils import try_to_import_transformers -class AbstractHFUdf(AbstractUDF, GPUCompatible): +class AbstractHFFunction(AbstractFunction, GPUCompatible): """ An abstract class for all HuggingFace models. @@ -41,10 +41,12 @@ class AbstractHFUdf(AbstractUDF, GPUCompatible): def name(self) -> str: return "GenericHuggingfaceModel" - def __init__(self, udf_obj: UdfCatalogEntry, device: int = -1, *args, **kwargs): + def __init__( + self, function_obj: FunctionCatalogEntry, device: int = -1, *args, **kwargs + ): super().__init__(*args, **kwargs) pipeline_args = self.default_pipeline_args - for entry in udf_obj.metadata: + for entry in function_obj.metadata: if entry.value.isnumeric(): pipeline_args[entry.key] = int(entry.value) else: @@ -53,7 +55,7 @@ def __init__(self, udf_obj: UdfCatalogEntry, device: int = -1, *args, **kwargs): try_to_import_transformers() from transformers import pipeline - self.hf_udf_obj = pipeline(**pipeline_args, device=device) + self.hf_function_obj = pipeline(**pipeline_args, device=device) def setup(self, *args, **kwargs) -> None: super().setup(*args, **kwargs) @@ -97,7 +99,7 @@ def output_formatter(self, outputs: Any): def forward(self, inputs, *args, **kwargs) -> pd.DataFrame: hf_input = self.input_formatter(inputs) - hf_output = self.hf_udf_obj(hf_input, *args, **kwargs) + hf_output = self.hf_function_obj(hf_input, *args, **kwargs) evadb_output = self.output_formatter(hf_output) return evadb_output @@ -105,5 +107,5 @@ def to_device(self, device: str) -> GPUCompatible: try_to_import_transformers() from transformers import pipeline - self.hf_udf_obj = pipeline(**self.pipeline_args, device=device) + self.hf_function_obj = pipeline(**self.pipeline_args, device=device) return self diff --git a/evadb/udfs/abstract/pytorch_abstract_udf.py b/evadb/functions/abstract/pytorch_abstract_function.py similarity index 91% rename from evadb/udfs/abstract/pytorch_abstract_udf.py rename to evadb/functions/abstract/pytorch_abstract_function.py 
index 16f3a48aa9..763f3658f7 100644 --- a/evadb/udfs/abstract/pytorch_abstract_udf.py +++ b/evadb/functions/abstract/pytorch_abstract_function.py @@ -18,11 +18,11 @@ from numpy.typing import ArrayLike from evadb.configuration.configuration_manager import ConfigurationManager -from evadb.udfs.abstract.abstract_udf import ( - AbstractClassifierUDF, - AbstractTransformationUDF, +from evadb.functions.abstract.abstract_function import ( + AbstractClassifierFunction, + AbstractTransformationFunction, ) -from evadb.udfs.gpu_compatible import GPUCompatible +from evadb.functions.gpu_compatible import GPUCompatible from evadb.utils.generic_utils import ( try_to_import_pillow, try_to_import_torch, @@ -38,7 +38,9 @@ from torchvision.transforms import Compose, transforms -class PytorchAbstractClassifierUDF(AbstractClassifierUDF, nn.Module, GPUCompatible): +class PytorchAbstractClassifierFunction( + AbstractClassifierFunction, nn.Module, GPUCompatible +): """ A pytorch based classifier. Used to make sure we make maximum utilization of features provided by pytorch without reinventing the wheel. @@ -107,7 +109,7 @@ def to_device(self, device: str) -> GPUCompatible: return self.to(torch.device("cuda:{}".format(device))) -class PytorchAbstractTransformationUDF(AbstractTransformationUDF, Compose): +class PytorchAbstractTransformationFunction(AbstractTransformationFunction, Compose): """ Use PyTorch torchvision transforms as EvaDB transforms. 
""" diff --git a/evadb/udfs/abstract/tracker_abstract_udf.py b/evadb/functions/abstract/tracker_abstract_function.py similarity index 91% rename from evadb/udfs/abstract/tracker_abstract_udf.py rename to evadb/functions/abstract/tracker_abstract_function.py index 10f00bd40e..bfd659b399 100644 --- a/evadb/udfs/abstract/tracker_abstract_udf.py +++ b/evadb/functions/abstract/tracker_abstract_function.py @@ -18,12 +18,12 @@ import pandas as pd from evadb.catalog.catalog_type import NdArrayType -from evadb.udfs.abstract.abstract_udf import AbstractUDF -from evadb.udfs.decorators.decorators import forward, setup -from evadb.udfs.decorators.io_descriptors.data_types import PandasDataframe +from evadb.functions.abstract.abstract_function import AbstractFunction +from evadb.functions.decorators.decorators import forward, setup +from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe -class EvaDBTrackerAbstractUDF(AbstractUDF): +class EvaDBTrackerAbstractFunction(AbstractFunction): """ An abstract class for all EvaDB object trackers. 
""" @@ -31,7 +31,7 @@ class EvaDBTrackerAbstractUDF(AbstractUDF): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - @setup(cacheable=False, udf_type="object_tracker", batchable=False) + @setup(cacheable=False, function_type="object_tracker", batchable=False) def setup(self, *args, **kwargs): super().setup(*args, **kwargs) diff --git a/evadb/udfs/asl_20_actions_map.pkl b/evadb/functions/asl_20_actions_map.pkl similarity index 100% rename from evadb/udfs/asl_20_actions_map.pkl rename to evadb/functions/asl_20_actions_map.pkl diff --git a/evadb/udfs/asl_action_recognition.py b/evadb/functions/asl_action_recognition.py similarity index 95% rename from evadb/udfs/asl_action_recognition.py rename to evadb/functions/asl_action_recognition.py index d6ba42e372..f4493a1d2d 100644 --- a/evadb/udfs/asl_action_recognition.py +++ b/evadb/functions/asl_action_recognition.py @@ -19,11 +19,13 @@ import numpy as np import pandas as pd -from evadb.udfs.abstract.pytorch_abstract_udf import PytorchAbstractClassifierUDF +from evadb.functions.abstract.pytorch_abstract_function import ( + PytorchAbstractClassifierFunction, +) from evadb.utils.generic_utils import try_to_import_torch, try_to_import_torchvision -class ASLActionRecognition(PytorchAbstractClassifierUDF): +class ASLActionRecognition(PytorchAbstractClassifierFunction): @property def name(self) -> str: return "ASLActionRecognition" diff --git a/evadb/udfs/chatgpt.py b/evadb/functions/chatgpt.py similarity index 92% rename from evadb/udfs/chatgpt.py rename to evadb/functions/chatgpt.py index ba6e9cb87a..c7d9ab121e 100644 --- a/evadb/udfs/chatgpt.py +++ b/evadb/functions/chatgpt.py @@ -21,9 +21,9 @@ from evadb.catalog.catalog_type import NdArrayType from evadb.configuration.configuration_manager import ConfigurationManager -from evadb.udfs.abstract.abstract_udf import AbstractUDF -from evadb.udfs.decorators.decorators import forward, setup -from evadb.udfs.decorators.io_descriptors.data_types import 
PandasDataframe +from evadb.functions.abstract.abstract_function import AbstractFunction +from evadb.functions.decorators.decorators import forward, setup +from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe from evadb.utils.generic_utils import try_to_import_openai _VALID_CHAT_COMPLETION_MODEL = [ @@ -36,7 +36,7 @@ ] -class ChatGPT(AbstractUDF): +class ChatGPT(AbstractFunction): """ Arguments: model (str) : ID of the OpenAI model to use. Refer to '_VALID_CHAT_COMPLETION_MODEL' for a list of supported models. @@ -55,12 +55,12 @@ class ChatGPT(AbstractUDF): Example Usage: Assume we have the transcripts for a few videos stored in a table 'video_transcripts' in a column named 'text'. - If the user wants to retrieve the summary of each video, the ChatGPT UDF can be used as: + If the user wants to retrieve the summary of each video, the ChatGPT function can be used as: query = "Generate the summary of the video" cursor.table("video_transcripts").select(f"ChatGPT({query}, text)") - In the above UDF invocation, the 'query' passed would be the user task to generate video summaries, and the + In the above function invocation, the 'query' passed would be the user task to generate video summaries, and the 'content' passed would be the video transcripts that need to be used in order to generate the summary. Since no prompt is passed, the default system prompt will be used. 
@@ -80,7 +80,7 @@ class ChatGPT(AbstractUDF): def name(self) -> str: return "ChatGPT" - @setup(cacheable=False, udf_type="chat-completion", batchable=True) + @setup(cacheable=False, function_type="chat-completion", batchable=True) def setup( self, model="gpt-3.5-turbo", @@ -140,7 +140,7 @@ def completion_with_backoff(**kwargs): prompt = text_df.iloc[0, 2] # openai api currently supports answers to a single prompt only - # so this udf is designed for that + # so this function is designed for that results = [] for query, content in zip(queries, content): diff --git a/evadb/udfs/decorators/__init__.py b/evadb/functions/decorators/__init__.py similarity index 100% rename from evadb/udfs/decorators/__init__.py rename to evadb/functions/decorators/__init__.py diff --git a/evadb/udfs/decorators/decorators.py b/evadb/functions/decorators/decorators.py similarity index 75% rename from evadb/udfs/decorators/decorators.py rename to evadb/functions/decorators/decorators.py index ede1715403..097de8a25b 100644 --- a/evadb/udfs/decorators/decorators.py +++ b/evadb/functions/decorators/decorators.py @@ -16,27 +16,29 @@ from typing import List -from evadb.udfs.decorators.io_descriptors.abstract_types import IOArgument +from evadb.functions.decorators.io_descriptors.abstract_types import IOArgument -def setup(cacheable: bool = False, udf_type: str = "Abstract", batchable: bool = True): +def setup( + cacheable: bool = False, function_type: str = "Abstract", batchable: bool = True +): """decorator for the setup function. 
It will be used to set the cache, batching and - udf_type parameters in the catalog + function_type parameters in the catalog Args: - use_cache (bool): True if the udf should be cached - udf_type (str): Type of the udf - batch (bool): True if the udf should be batched + use_cache (bool): True if the function should be cached + function_type (str): Type of the function + batch (bool): True if the function should be batched """ def inner_fn(arg_fn): def wrapper(*args, **kwargs): - # calling the setup function defined by the user inside the udf implementation + # calling the setup function defined by the user inside the function implementation arg_fn(*args, **kwargs) tags = {} tags["cacheable"] = cacheable - tags["udf_type"] = udf_type + tags["function_type"] = function_type tags["batchable"] = batchable wrapper.tags = tags return wrapper @@ -48,13 +50,13 @@ def forward(input_signatures: List[IOArgument], output_signatures: List[IOArgume """decorator for the forward function. It will be used to set the input and output. 
Args: - input_signature (List[IOArgument]): List of input arguments for the udf - output_signature ( List[IOArgument])): List of output arguments for the udf + input_signature (List[IOArgument]): List of input arguments for the function + output_signature ( List[IOArgument])): List of output arguments for the function """ def inner_fn(arg_fn): def wrapper(*args): - # calling the forward function defined by the user inside the udf implementation + # calling the forward function defined by the user inside the function implementation return arg_fn(*args) tags = {} diff --git a/evadb/udfs/decorators/io_descriptors/__init__.py b/evadb/functions/decorators/io_descriptors/__init__.py similarity index 100% rename from evadb/udfs/decorators/io_descriptors/__init__.py rename to evadb/functions/decorators/io_descriptors/__init__.py diff --git a/evadb/udfs/decorators/io_descriptors/abstract_types.py b/evadb/functions/decorators/io_descriptors/abstract_types.py similarity index 86% rename from evadb/udfs/decorators/io_descriptors/abstract_types.py rename to evadb/functions/decorators/io_descriptors/abstract_types.py index 549b00b6fa..e8b8ad4b20 100644 --- a/evadb/udfs/decorators/io_descriptors/abstract_types.py +++ b/evadb/functions/decorators/io_descriptors/abstract_types.py @@ -16,12 +16,12 @@ from typing import List, Tuple, Type from evadb.catalog.catalog_type import ColumnType, NdArrayType -from evadb.catalog.models.udf_io_catalog import UdfIOCatalogEntry +from evadb.catalog.models.function_io_catalog import FunctionIOCatalogEntry class IOArgument(ABC): """ - Base class for representing inputs/outputs (IO) of a UDF using decorators. This class defines methods + Base class for representing inputs/outputs (IO) of a function using decorators. This class defines methods that are common for all the IO arguments. 
""" @@ -32,7 +32,7 @@ def __init__(self) -> None: @abstractmethod def generate_catalog_entries( self, *args, **kwargs - ) -> List[Type[UdfIOCatalogEntry]]: + ) -> List[Type[FunctionIOCatalogEntry]]: """Generates the catalog IO entries from the Argument. Returns: @@ -69,7 +69,9 @@ def __init__( self.array_type = array_type self.array_dimensions = array_dimensions - def generate_catalog_entries(self, is_input=False) -> List[Type[UdfIOCatalogEntry]]: + def generate_catalog_entries( + self, is_input=False + ) -> List[Type[FunctionIOCatalogEntry]]: """Generates the catalog IO entries from the Argument. Returns: @@ -77,7 +79,7 @@ def generate_catalog_entries(self, is_input=False) -> List[Type[UdfIOCatalogEntr """ return [ - UdfIOCatalogEntry( + FunctionIOCatalogEntry( name=self.name, type=self.type, is_nullable=self.is_nullable, diff --git a/evadb/udfs/decorators/io_descriptors/data_types.py b/evadb/functions/decorators/io_descriptors/data_types.py similarity index 88% rename from evadb/udfs/decorators/io_descriptors/data_types.py rename to evadb/functions/decorators/io_descriptors/data_types.py index d99bc5cfed..45a1dc4d28 100644 --- a/evadb/udfs/decorators/io_descriptors/data_types.py +++ b/evadb/functions/decorators/io_descriptors/data_types.py @@ -15,12 +15,12 @@ from typing import List, Tuple, Type from evadb.catalog.catalog_type import ColumnType, Dimension, NdArrayType -from evadb.catalog.models.udf_io_catalog import UdfIOCatalogEntry -from evadb.udfs.decorators.io_descriptors.abstract_types import ( +from evadb.catalog.models.function_io_catalog import FunctionIOCatalogEntry +from evadb.functions.decorators.io_descriptors.abstract_types import ( IOArgument, IOColumnArgument, ) -from evadb.utils.errors import UDFIODefinitionError +from evadb.utils.errors import FunctionIODefinitionError class NumpyArray(IOColumnArgument): @@ -70,7 +70,9 @@ def __init__(self, columns, column_types=[], column_shapes=[]) -> None: self.column_types = column_types self.column_shapes = 
column_shapes - def generate_catalog_entries(self, is_input=False) -> List[Type[UdfIOCatalogEntry]]: + def generate_catalog_entries( + self, is_input=False + ) -> List[Type[FunctionIOCatalogEntry]]: catalog_entries = [] if not self.column_types: @@ -83,7 +85,7 @@ def generate_catalog_entries(self, is_input=False) -> List[Type[UdfIOCatalogEntr if len(self.columns) != len(self.column_types) or len(self.columns) != len( self.column_shapes ): - raise UDFIODefinitionError( + raise FunctionIODefinitionError( "columns, column_types and column_shapes should be of same length if specified. " ) @@ -91,7 +93,7 @@ def generate_catalog_entries(self, is_input=False) -> List[Type[UdfIOCatalogEntr self.columns, self.column_types, self.column_shapes ): catalog_entries.append( - UdfIOCatalogEntry( + FunctionIOCatalogEntry( name=column_name, type=ColumnType.NDARRAY, is_nullable=False, diff --git a/evadb/udfs/decorators/utils.py b/evadb/functions/decorators/utils.py similarity index 63% rename from evadb/udfs/decorators/utils.py rename to evadb/functions/decorators/utils.py index c39a991b28..b5a9611143 100644 --- a/evadb/udfs/decorators/utils.py +++ b/evadb/functions/decorators/utils.py @@ -14,30 +14,30 @@ # limitations under the License. 
from typing import List, Type -from evadb.catalog.models.udf_io_catalog import UdfIOCatalogEntry -from evadb.udfs.abstract.abstract_udf import AbstractUDF +from evadb.catalog.models.function_io_catalog import FunctionIOCatalogEntry +from evadb.functions.abstract.abstract_function import AbstractFunction -def load_io_from_udf_decorators( - udf: Type[AbstractUDF], is_input=False -) -> List[Type[UdfIOCatalogEntry]]: - """Load the inputs/outputs from the udf decorators and return a list of UdfIOCatalogEntry objects +def load_io_from_function_decorators( + function: Type[AbstractFunction], is_input=False +) -> List[Type[FunctionIOCatalogEntry]]: + """Load the inputs/outputs from the function decorators and return a list of FunctionIOCatalogEntry objects Args: - udf (Object): UDF object + function (Object): Function object is_input (bool, optional): True if inputs are to be loaded. Defaults to False. Returns: - Type[UdfIOCatalogEntry]: UdfIOCatalogEntry object created from the input decorator in setup + Type[FunctionIOCatalogEntry]: FunctionIOCatalogEntry object created from the input decorator in setup """ tag_key = "input" if is_input else "output" io_signature = None - if hasattr(udf.forward, "tags") and tag_key in udf.forward.tags: - io_signature = udf.forward.tags[tag_key] + if hasattr(function.forward, "tags") and tag_key in function.forward.tags: + io_signature = function.forward.tags[tag_key] else: # Attempt to populate from the parent class and stop at the first parent class # where the required tags are found. - for base_class in udf.__bases__: + for base_class in function.__bases__: if hasattr(base_class, "forward") and hasattr(base_class.forward, "tags"): if tag_key in base_class.forward.tags: io_signature = base_class.forward.tags[tag_key] @@ -45,7 +45,7 @@ def load_io_from_udf_decorators( assert ( io_signature is not None - ), f"Cannot infer io signature from the decorator for {udf}." + ), f"Cannot infer io signature from the decorator for {function}." 
result_list = [] for io in io_signature: diff --git a/evadb/udfs/emotion_detector.py b/evadb/functions/emotion_detector.py similarity index 97% rename from evadb/udfs/emotion_detector.py rename to evadb/functions/emotion_detector.py index ccda32badf..5bab85c336 100644 --- a/evadb/udfs/emotion_detector.py +++ b/evadb/functions/emotion_detector.py @@ -19,7 +19,9 @@ import numpy as np import pandas as pd -from evadb.udfs.abstract.pytorch_abstract_udf import PytorchAbstractClassifierUDF +from evadb.functions.abstract.pytorch_abstract_function import ( + PytorchAbstractClassifierFunction, +) from evadb.utils.generic_utils import ( try_to_import_pillow, try_to_import_torch, @@ -54,7 +56,7 @@ } -class EmotionDetector(PytorchAbstractClassifierUDF): +class EmotionDetector(PytorchAbstractClassifierFunction): """ Arguments: threshold (float): Threshold for classifier confidence score diff --git a/evadb/udfs/face_detector.py b/evadb/functions/face_detector.py similarity index 93% rename from evadb/udfs/face_detector.py rename to evadb/functions/face_detector.py index eeef81dd21..ea5f5f07eb 100644 --- a/evadb/udfs/face_detector.py +++ b/evadb/functions/face_detector.py @@ -18,8 +18,8 @@ import numpy as np import pandas as pd -from evadb.udfs.abstract.abstract_udf import AbstractClassifierUDF -from evadb.udfs.gpu_compatible import GPUCompatible +from evadb.functions.abstract.abstract_function import AbstractClassifierFunction +from evadb.functions.gpu_compatible import GPUCompatible from evadb.utils.generic_utils import ( try_to_import_facenet_pytorch, try_to_import_torch, @@ -28,7 +28,7 @@ from evadb.utils.logging_manager import logger -class FaceDetector(AbstractClassifierUDF, GPUCompatible): +class FaceDetector(AbstractClassifierFunction, GPUCompatible): """ Arguments: threshold (float): Threshold for classifier confidence score diff --git a/evadb/udfs/fastrcnn_object_detector.py b/evadb/functions/fastrcnn_object_detector.py similarity index 93% rename from 
evadb/udfs/fastrcnn_object_detector.py rename to evadb/functions/fastrcnn_object_detector.py index e75526b23c..9db7d9fc8e 100644 --- a/evadb/udfs/fastrcnn_object_detector.py +++ b/evadb/functions/fastrcnn_object_detector.py @@ -18,16 +18,18 @@ import pandas as pd from evadb.catalog.catalog_type import NdArrayType -from evadb.udfs.abstract.pytorch_abstract_udf import PytorchAbstractClassifierUDF -from evadb.udfs.decorators.decorators import forward, setup -from evadb.udfs.decorators.io_descriptors.data_types import ( +from evadb.functions.abstract.pytorch_abstract_function import ( + PytorchAbstractClassifierFunction, +) +from evadb.functions.decorators.decorators import forward, setup +from evadb.functions.decorators.io_descriptors.data_types import ( PandasDataframe, PyTorchTensor, ) from evadb.utils.generic_utils import try_to_import_torch, try_to_import_torchvision -class FastRCNNObjectDetector(PytorchAbstractClassifierUDF): +class FastRCNNObjectDetector(PytorchAbstractClassifierFunction): """ Arguments: threshold (float): Threshold for classifier confidence score @@ -38,7 +40,7 @@ class FastRCNNObjectDetector(PytorchAbstractClassifierUDF): def name(self) -> str: return "fastrcnn" - @setup(cacheable=True, udf_type="object_detection", batchable=True) + @setup(cacheable=True, function_type="object_detection", batchable=True) def setup(self, threshold=0.85): try_to_import_torch() try_to_import_torchvision() diff --git a/evadb/udfs/feature_extractor.py b/evadb/functions/feature_extractor.py similarity index 91% rename from evadb/udfs/feature_extractor.py rename to evadb/functions/feature_extractor.py index 8ab6db1c0a..df8190278b 100644 --- a/evadb/udfs/feature_extractor.py +++ b/evadb/functions/feature_extractor.py @@ -20,10 +20,12 @@ from torch import Tensor from torchvision import models -from evadb.udfs.abstract.pytorch_abstract_udf import PytorchAbstractClassifierUDF +from evadb.functions.abstract.pytorch_abstract_function import ( + 
PytorchAbstractClassifierFunction, +) -class FeatureExtractor(PytorchAbstractClassifierUDF): +class FeatureExtractor(PytorchAbstractClassifierFunction): """ """ def setup(self): diff --git a/evadb/functions/forecast.py b/evadb/functions/forecast.py new file mode 100644 index 0000000000..52e1cc2cb4 --- /dev/null +++ b/evadb/functions/forecast.py @@ -0,0 +1,64 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pickle + +import pandas as pd + +from evadb.catalog.catalog_type import NdArrayType +from evadb.functions.abstract.abstract_function import AbstractFunction +from evadb.functions.decorators.decorators import forward, setup +from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe + + +class ForecastModel(AbstractFunction): + @property + def name(self) -> str: + return "ForecastModel" + + @setup(cacheable=False, function_type="Forecasting", batchable=True) + def setup(self, model_name: str, model_path: str): + f = open(model_path, "rb") + loaded_model = pickle.load(f) + f.close() + self.model = loaded_model + self.model_name = model_name + + @forward( + input_signatures=[], + output_signatures=[ + PandasDataframe( + columns=["y"], + column_types=[ + NdArrayType.FLOAT32, + ], + column_shapes=[(None,)], + ) + ], + ) + def forward(self, data) -> pd.DataFrame: + horizon = list(data.iloc[:, -1])[0] + assert ( + type(horizon) is int + ), "Forecast UDF expects integral horizon in parameter."
+ forecast_df = self.model.predict(h=horizon) + forecast_df = forecast_df.rename(columns={self.model_name: "y"}) + return pd.DataFrame( + forecast_df, + columns=[ + "y", + ], + ) diff --git a/evadb/udfs/udf_bootstrap_queries.py b/evadb/functions/function_bootstrap_queries.py similarity index 59% rename from evadb/udfs/udf_bootstrap_queries.py rename to evadb/functions/function_bootstrap_queries.py index 2b358ea8d8..cd75e9a655 100644 --- a/evadb/udfs/udf_bootstrap_queries.py +++ b/evadb/functions/function_bootstrap_queries.py @@ -20,7 +20,7 @@ NDARRAY_DIR = "ndarray" TUTORIALS_DIR = "tutorials" -DummyObjectDetector_udf_query = """CREATE UDF IF NOT EXISTS DummyObjectDetector +DummyObjectDetector_function_query = """CREATE FUNCTION IF NOT EXISTS DummyObjectDetector INPUT (Frame_Array NDARRAY INT8(3, ANYDIM, ANYDIM)) OUTPUT (label NDARRAY STR(1)) TYPE Classification @@ -29,7 +29,7 @@ EvaDB_INSTALLATION_DIR ) -DummyMultiObjectDetector_udf_query = """CREATE UDF +DummyMultiObjectDetector_function_query = """CREATE FUNCTION IF NOT EXISTS DummyMultiObjectDetector INPUT (Frame_Array NDARRAY INT8(3, ANYDIM, ANYDIM)) OUTPUT (labels NDARRAY STR(2)) @@ -39,7 +39,7 @@ EvaDB_INSTALLATION_DIR ) -DummyFeatureExtractor_udf_query = """CREATE UDF +DummyFeatureExtractor_function_query = """CREATE FUNCTION IF NOT EXISTS DummyFeatureExtractor INPUT (Frame_Array NDARRAY UINT8(3, ANYDIM, ANYDIM)) OUTPUT (features NDARRAY FLOAT32(1, ANYDIM)) @@ -49,156 +49,156 @@ EvaDB_INSTALLATION_DIR ) -fuzzy_udf_query = """CREATE UDF IF NOT EXISTS FuzzDistance +fuzzy_function_query = """CREATE FUNCTION IF NOT EXISTS FuzzDistance INPUT (Input_Array1 NDARRAY ANYTYPE, Input_Array2 NDARRAY ANYTYPE) OUTPUT (distance FLOAT(32, 7)) - TYPE NdarrayUDF - IMPL "{}/udfs/{}/fuzzy_join.py"; + TYPE NdarrayFunction + IMPL "{}/functions/{}/fuzzy_join.py"; """.format( EvaDB_INSTALLATION_DIR, NDARRAY_DIR ) -ArrayCount_udf_query = """CREATE UDF +ArrayCount_function_query = """CREATE FUNCTION IF NOT EXISTS ArrayCount INPUT 
(Input_Array NDARRAY ANYTYPE, Search_Key ANYTYPE) OUTPUT (key_count INTEGER) - TYPE NdarrayUDF - IMPL "{}/udfs/{}/array_count.py"; + TYPE NdarrayFunction + IMPL "{}/functions/{}/array_count.py"; """.format( EvaDB_INSTALLATION_DIR, NDARRAY_DIR ) -Crop_udf_query = """CREATE UDF IF NOT EXISTS Crop +Crop_function_query = """CREATE FUNCTION IF NOT EXISTS Crop INPUT (Frame_Array NDARRAY UINT8(3, ANYDIM, ANYDIM), bboxes NDARRAY FLOAT32(ANYDIM, 4)) OUTPUT (Cropped_Frame_Array NDARRAY UINT8(3, ANYDIM, ANYDIM)) - TYPE NdarrayUDF - IMPL "{}/udfs/{}/crop.py"; + TYPE NdarrayFunction + IMPL "{}/functions/{}/crop.py"; """.format( EvaDB_INSTALLATION_DIR, NDARRAY_DIR ) -Open_udf_query = """CREATE UDF IF NOT EXISTS Open +Open_function_query = """CREATE FUNCTION IF NOT EXISTS Open INPUT (img_path TEXT(1000)) OUTPUT (data NDARRAY UINT8(3, ANYDIM, ANYDIM)) - TYPE NdarrayUDF - IMPL "{}/udfs/{}/open.py"; + TYPE NdarrayFunction + IMPL "{}/functions/{}/open.py"; """.format( EvaDB_INSTALLATION_DIR, NDARRAY_DIR ) -Similarity_udf_query = """CREATE UDF IF NOT EXISTS Similarity +Similarity_function_query = """CREATE FUNCTION IF NOT EXISTS Similarity INPUT (Frame_Array_Open NDARRAY UINT8(3, ANYDIM, ANYDIM), Frame_Array_Base NDARRAY UINT8(3, ANYDIM, ANYDIM), Feature_Extractor_Name TEXT(100)) OUTPUT (distance FLOAT(32, 7)) - TYPE NdarrayUDF - IMPL "{}/udfs/{}/similarity.py"; + TYPE NdarrayFunction + IMPL "{}/functions/{}/similarity.py"; """.format( EvaDB_INSTALLATION_DIR, NDARRAY_DIR ) -Unnest_udf_query = """CREATE UDF IF NOT EXISTS Unnest +Unnest_function_query = """CREATE FUNCTION IF NOT EXISTS Unnest INPUT (inp NDARRAY ANYTYPE) OUTPUT (out ANYTYPE) - TYPE NdarrayUDF - IMPL "{}/udfs/{}/unnest.py"; + TYPE NdarrayFunction + IMPL "{}/functions/{}/unnest.py"; """.format( EvaDB_INSTALLATION_DIR, NDARRAY_DIR ) -Fastrcnn_udf_query = """CREATE UDF IF NOT EXISTS FastRCNNObjectDetector +Fastrcnn_function_query = """CREATE FUNCTION IF NOT EXISTS FastRCNNObjectDetector INPUT (Frame_Array NDARRAY UINT8(3, 
ANYDIM, ANYDIM)) OUTPUT (labels NDARRAY STR(ANYDIM), bboxes NDARRAY FLOAT32(ANYDIM, 4), scores NDARRAY FLOAT32(ANYDIM)) TYPE Classification - IMPL '{}/udfs/fastrcnn_object_detector.py'; + IMPL '{}/functions/fastrcnn_object_detector.py'; """.format( EvaDB_INSTALLATION_DIR ) -Yolo_udf_query = """CREATE UDF IF NOT EXISTS Yolo +Yolo_function_query = """CREATE FUNCTION IF NOT EXISTS Yolo TYPE ultralytics - 'model' 'yolov8m.pt'; + MODEL 'yolov8m.pt'; """ -face_detection_udf_query = """CREATE UDF IF NOT EXISTS FaceDetector +face_detection_function_query = """CREATE FUNCTION IF NOT EXISTS FaceDetector INPUT (frame NDARRAY UINT8(3, ANYDIM, ANYDIM)) OUTPUT (bboxes NDARRAY FLOAT32(ANYDIM, 4), scores NDARRAY FLOAT32(ANYDIM)) TYPE FaceDetection - IMPL '{}/udfs/face_detector.py'; + IMPL '{}/functions/face_detector.py'; """.format( EvaDB_INSTALLATION_DIR ) -Mvit_udf_query = """CREATE UDF IF NOT EXISTS MVITActionRecognition +Mvit_function_query = """CREATE FUNCTION IF NOT EXISTS MVITActionRecognition INPUT (Frame_Array NDARRAY UINT8(3, 16, 224, 224)) OUTPUT (labels NDARRAY STR(ANYDIM)) TYPE Classification - IMPL '{}/udfs/mvit_action_recognition.py'; + IMPL '{}/functions/mvit_action_recognition.py'; """.format( EvaDB_INSTALLATION_DIR ) -Asl_udf_query = """CREATE UDF IF NOT EXISTS ASLActionRecognition +Asl_function_query = """CREATE FUNCTION IF NOT EXISTS ASLActionRecognition INPUT (Frame_Array NDARRAY UINT8(3, 16, 224, 224)) OUTPUT (labels NDARRAY STR(ANYDIM)) TYPE Classification - IMPL '{}/udfs/asl_action_recognition.py'; + IMPL '{}/functions/asl_action_recognition.py'; """.format( EvaDB_INSTALLATION_DIR ) -norfair_obj_tracker_query = """CREATE UDF IF NOT EXISTS NorFairTracker - IMPL '{}/udfs/trackers/nor_fair.py'; +norfair_obj_tracker_query = """CREATE FUNCTION IF NOT EXISTS NorFairTracker + IMPL '{}/functions/trackers/nor_fair.py'; """.format( EvaDB_INSTALLATION_DIR ) -Sift_udf_query = """CREATE UDF IF NOT EXISTS SiftFeatureExtractor - IMPL '{}/udfs/sift_feature_extractor.py'; 
+Sift_function_query = """CREATE FUNCTION IF NOT EXISTS SiftFeatureExtractor + IMPL '{}/functions/sift_feature_extractor.py'; """.format( EvaDB_INSTALLATION_DIR ) -Text_feat_udf_query = """CREATE UDF IF NOT EXISTS SentenceFeatureExtractor - IMPL '{}/udfs/sentence_feature_extractor.py'; +Text_feat_function_query = """CREATE FUNCTION IF NOT EXISTS SentenceFeatureExtractor + IMPL '{}/functions/sentence_feature_extractor.py'; """.format( EvaDB_INSTALLATION_DIR ) -mnistcnn_udf_query = """CREATE UDF IF NOT EXISTS MnistImageClassifier +mnistcnn_function_query = """CREATE FUNCTION IF NOT EXISTS MnistImageClassifier INPUT (data NDARRAY (3, 28, 28)) OUTPUT (label TEXT(2)) TYPE Classification - IMPL '{}/udfs/mnist_image_classifier.py'; + IMPL '{}/functions/mnist_image_classifier.py'; """.format( EvaDB_INSTALLATION_DIR ) -chatgpt_udf_query = """CREATE UDF IF NOT EXISTS ChatGPT - IMPL '{}/udfs/chatgpt.py'; +chatgpt_function_query = """CREATE FUNCTION IF NOT EXISTS ChatGPT + IMPL '{}/functions/chatgpt.py'; """.format( EvaDB_INSTALLATION_DIR ) -yolo8n_query = """CREATE UDF IF NOT EXISTS Yolo +yolo8n_query = """CREATE FUNCTION IF NOT EXISTS Yolo TYPE ultralytics - 'model' 'yolov8n.pt'; + MODEL 'yolov8n.pt'; """ -def init_builtin_udfs(db: EvaDBDatabase, mode: str = "debug") -> None: - """Load the built-in UDFs into the system during system bootstrapping. +def init_builtin_functions(db: EvaDBDatabase, mode: str = "debug") -> None: + """Load the built-in functions into the system during system bootstrapping. - The function loads a set of pre-defined UDF queries based on the `mode` argument. - In 'debug' mode, the function loads debug UDFs along with release UDFs. - In 'release' mode, only release UDFs are loaded. In addition, in 'debug' mode, + The function loads a set of pre-defined function queries based on the `mode` argument. + In 'debug' mode, the function loads debug functions along with release functions. + In 'release' mode, only release functions are loaded. 
In addition, in 'debug' mode, the function loads a smaller model to accelerate the test suite time. Args: - mode (str, optional): The mode for loading UDFs, either 'debug' or 'release'. + mode (str, optional): The mode for loading functions, either 'debug' or 'release'. Defaults to 'debug'. """ @@ -219,33 +219,33 @@ def init_builtin_udfs(db: EvaDBDatabase, mode: str = "debug") -> None: os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" os.environ["TOKENIZERS_PARALLELISM"] = "false" - # list of UDF queries to load + # list of function queries to load queries = [ - mnistcnn_udf_query, - Fastrcnn_udf_query, - ArrayCount_udf_query, - Crop_udf_query, - Open_udf_query, - Similarity_udf_query, + mnistcnn_function_query, + Fastrcnn_function_query, + ArrayCount_function_query, + Crop_function_query, + Open_function_query, + Similarity_function_query, norfair_obj_tracker_query, - chatgpt_udf_query, - face_detection_udf_query, - # Mvit_udf_query, - Sift_udf_query, - Yolo_udf_query, + chatgpt_function_query, + face_detection_function_query, + # Mvit_function_query, + Sift_function_query, + Yolo_function_query, ] - # if mode is 'debug', add debug UDFs + # if mode is 'debug', add debug functions if mode == "debug": queries.extend( [ - DummyObjectDetector_udf_query, - DummyMultiObjectDetector_udf_query, - DummyFeatureExtractor_udf_query, + DummyObjectDetector_function_query, + DummyMultiObjectDetector_function_query, + DummyFeatureExtractor_function_query, ] ) - # execute each query in the list of UDF queries + # execute each query in the list of function queries # ignore exceptions during the bootstrapping phase due to missing packages for query in queries: try: diff --git a/evadb/udfs/fuzzy_join.py b/evadb/functions/fuzzy_join.py similarity index 86% rename from evadb/udfs/fuzzy_join.py rename to evadb/functions/fuzzy_join.py index 5c6cd9134d..38a0de4a28 100644 --- a/evadb/udfs/fuzzy_join.py +++ b/evadb/functions/fuzzy_join.py @@ -33,13 +33,13 @@ from thefuzz 
import fuzz from evadb.catalog.catalog_type import NdArrayType -from evadb.udfs.abstract.abstract_udf import AbstractUDF -from evadb.udfs.decorators.decorators import forward, setup -from evadb.udfs.decorators.io_descriptors.data_types import PandasDataframe +from evadb.functions.abstract.abstract_function import AbstractFunction +from evadb.functions.decorators.decorators import forward, setup +from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe -class FuzzDistance(AbstractUDF): - @setup(cacheable=False, udf_type="FeatureExtraction", batchable=False) +class FuzzDistance(AbstractFunction): + @setup(cacheable=False, function_type="FeatureExtraction", batchable=False) def setup(self): pass diff --git a/evadb/udfs/gpu_compatible.py b/evadb/functions/gpu_compatible.py similarity index 100% rename from evadb/udfs/gpu_compatible.py rename to evadb/functions/gpu_compatible.py diff --git a/evadb/udfs/ludwig.py b/evadb/functions/ludwig.py similarity index 91% rename from evadb/udfs/ludwig.py rename to evadb/functions/ludwig.py index e4e58f3b06..ebc5f00e08 100644 --- a/evadb/udfs/ludwig.py +++ b/evadb/functions/ludwig.py @@ -14,11 +14,11 @@ # limitations under the License. 
import pandas as pd -from evadb.udfs.abstract.abstract_udf import AbstractUDF +from evadb.functions.abstract.abstract_function import AbstractFunction from evadb.utils.generic_utils import try_to_import_ludwig -class GenericLudwigModel(AbstractUDF): +class GenericLudwigModel(AbstractFunction): @property def name(self) -> str: return "GenericLudwigModel" diff --git a/evadb/udfs/mnist_image_classifier.py b/evadb/functions/mnist_image_classifier.py similarity index 95% rename from evadb/udfs/mnist_image_classifier.py rename to evadb/functions/mnist_image_classifier.py index 2849829b3f..e78b2e990a 100644 --- a/evadb/udfs/mnist_image_classifier.py +++ b/evadb/functions/mnist_image_classifier.py @@ -16,11 +16,13 @@ import pandas as pd -from evadb.udfs.abstract.pytorch_abstract_udf import PytorchAbstractClassifierUDF +from evadb.functions.abstract.pytorch_abstract_function import ( + PytorchAbstractClassifierFunction, +) from evadb.utils.generic_utils import try_to_import_torch, try_to_import_torchvision -class MnistImageClassifier(PytorchAbstractClassifierUDF): +class MnistImageClassifier(PytorchAbstractClassifierFunction): @property def name(self) -> str: return "MnistImageClassifier" diff --git a/evadb/udfs/mvit_action_recognition.py b/evadb/functions/mvit_action_recognition.py similarity index 92% rename from evadb/udfs/mvit_action_recognition.py rename to evadb/functions/mvit_action_recognition.py index 3cc6856210..af38836373 100644 --- a/evadb/udfs/mvit_action_recognition.py +++ b/evadb/functions/mvit_action_recognition.py @@ -16,11 +16,13 @@ import numpy as np import pandas as pd -from evadb.udfs.abstract.pytorch_abstract_udf import PytorchAbstractClassifierUDF +from evadb.functions.abstract.pytorch_abstract_function import ( + PytorchAbstractClassifierFunction, +) from evadb.utils.generic_utils import try_to_import_torch, try_to_import_torchvision -class MVITActionRecognition(PytorchAbstractClassifierUDF): +class 
MVITActionRecognition(PytorchAbstractClassifierFunction): @property def name(self) -> str: return "MVITActionRecognition" diff --git a/evadb/udfs/ndarray/__init__.py b/evadb/functions/ndarray/__init__.py similarity index 100% rename from evadb/udfs/ndarray/__init__.py rename to evadb/functions/ndarray/__init__.py diff --git a/evadb/udfs/ndarray/annotate.py b/evadb/functions/ndarray/annotate.py similarity index 87% rename from evadb/udfs/ndarray/annotate.py rename to evadb/functions/ndarray/annotate.py index 29e61fef53..1951180d08 100644 --- a/evadb/udfs/ndarray/annotate.py +++ b/evadb/functions/ndarray/annotate.py @@ -16,17 +16,17 @@ import pandas as pd from evadb.catalog.catalog_type import NdArrayType -from evadb.udfs.abstract.abstract_udf import AbstractUDF -from evadb.udfs.decorators.decorators import forward, setup -from evadb.udfs.decorators.io_descriptors.data_types import PandasDataframe +from evadb.functions.abstract.abstract_function import AbstractFunction +from evadb.functions.decorators.decorators import forward, setup +from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe from evadb.utils.generic_utils import try_to_import_cv2 color = (207, 248, 64) thickness = 4 -class Annotate(AbstractUDF): - @setup(cacheable=False, udf_type="cv2-transformation", batchable=True) +class Annotate(AbstractFunction): + @setup(cacheable=False, function_type="cv2-transformation", batchable=True) def setup(self): pass diff --git a/evadb/udfs/ndarray/array_count.py b/evadb/functions/ndarray/array_count.py similarity index 95% rename from evadb/udfs/ndarray/array_count.py rename to evadb/functions/ndarray/array_count.py index 048014af1c..c7c5361b01 100644 --- a/evadb/udfs/ndarray/array_count.py +++ b/evadb/functions/ndarray/array_count.py @@ -15,10 +15,10 @@ import numpy as np import pandas as pd -from evadb.udfs.abstract.abstract_udf import AbstractUDF +from evadb.functions.abstract.abstract_function import AbstractFunction -class 
ArrayCount(AbstractUDF): +class ArrayCount(AbstractFunction): @property def name(self) -> str: return "ArrayCount" diff --git a/evadb/udfs/ndarray/crop.py b/evadb/functions/ndarray/crop.py similarity index 93% rename from evadb/udfs/ndarray/crop.py rename to evadb/functions/ndarray/crop.py index c5d1a9b260..6b5b7b93c6 100644 --- a/evadb/udfs/ndarray/crop.py +++ b/evadb/functions/ndarray/crop.py @@ -15,10 +15,10 @@ import numpy as np import pandas as pd -from evadb.udfs.abstract.abstract_udf import AbstractUDF +from evadb.functions.abstract.abstract_function import AbstractFunction -class Crop(AbstractUDF): +class Crop(AbstractFunction): def setup(self): pass diff --git a/evadb/udfs/ndarray/fuzzy_join.py b/evadb/functions/ndarray/fuzzy_join.py similarity index 91% rename from evadb/udfs/ndarray/fuzzy_join.py rename to evadb/functions/ndarray/fuzzy_join.py index 1e3461d650..485d76df63 100644 --- a/evadb/udfs/ndarray/fuzzy_join.py +++ b/evadb/functions/ndarray/fuzzy_join.py @@ -15,10 +15,10 @@ import pandas as pd from thefuzz import fuzz -from evadb.udfs.abstract.abstract_udf import AbstractUDF +from evadb.functions.abstract.abstract_function import AbstractFunction -class FuzzDistance(AbstractUDF): +class FuzzDistance(AbstractFunction): def setup(self): pass diff --git a/evadb/udfs/ndarray/gaussian_blur.py b/evadb/functions/ndarray/gaussian_blur.py similarity index 85% rename from evadb/udfs/ndarray/gaussian_blur.py rename to evadb/functions/ndarray/gaussian_blur.py index 2f3f33e60e..c3ff7136d9 100644 --- a/evadb/udfs/ndarray/gaussian_blur.py +++ b/evadb/functions/ndarray/gaussian_blur.py @@ -16,14 +16,14 @@ import pandas as pd from evadb.catalog.catalog_type import NdArrayType -from evadb.udfs.abstract.abstract_udf import AbstractUDF -from evadb.udfs.decorators.decorators import forward, setup -from evadb.udfs.decorators.io_descriptors.data_types import PandasDataframe +from evadb.functions.abstract.abstract_function import AbstractFunction +from 
evadb.functions.decorators.decorators import forward, setup +from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe from evadb.utils.generic_utils import try_to_import_cv2 -class GaussianBlur(AbstractUDF): - @setup(cacheable=False, udf_type="cv2-transformation", batchable=True) +class GaussianBlur(AbstractFunction): + @setup(cacheable=False, function_type="cv2-transformation", batchable=True) def setup(self): pass diff --git a/evadb/udfs/ndarray/horizontal_flip.py b/evadb/functions/ndarray/horizontal_flip.py similarity index 85% rename from evadb/udfs/ndarray/horizontal_flip.py rename to evadb/functions/ndarray/horizontal_flip.py index 5f4a843f23..6918ede1f7 100644 --- a/evadb/udfs/ndarray/horizontal_flip.py +++ b/evadb/functions/ndarray/horizontal_flip.py @@ -16,14 +16,14 @@ import pandas as pd from evadb.catalog.catalog_type import NdArrayType -from evadb.udfs.abstract.abstract_udf import AbstractUDF -from evadb.udfs.decorators.decorators import forward, setup -from evadb.udfs.decorators.io_descriptors.data_types import PandasDataframe +from evadb.functions.abstract.abstract_function import AbstractFunction +from evadb.functions.decorators.decorators import forward, setup +from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe from evadb.utils.generic_utils import try_to_import_cv2 -class HorizontalFlip(AbstractUDF): - @setup(cacheable=False, udf_type="cv2-transformation", batchable=True) +class HorizontalFlip(AbstractFunction): + @setup(cacheable=False, function_type="cv2-transformation", batchable=True) def setup(self): try_to_import_cv2() diff --git a/evadb/udfs/ndarray/open.py b/evadb/functions/ndarray/open.py similarity index 93% rename from evadb/udfs/ndarray/open.py rename to evadb/functions/ndarray/open.py index c539851bd6..2585d6ae43 100644 --- a/evadb/udfs/ndarray/open.py +++ b/evadb/functions/ndarray/open.py @@ -15,11 +15,11 @@ import numpy as np import pandas as pd -from 
evadb.udfs.abstract.abstract_udf import AbstractUDF +from evadb.functions.abstract.abstract_function import AbstractFunction from evadb.utils.generic_utils import try_to_import_cv2 -class Open(AbstractUDF): +class Open(AbstractFunction): def setup(self): # cache data to avoid expensive open files on disk self._data_cache = dict() diff --git a/evadb/udfs/ndarray/similarity.py b/evadb/functions/ndarray/similarity.py similarity index 94% rename from evadb/udfs/ndarray/similarity.py rename to evadb/functions/ndarray/similarity.py index 4ef35820f4..fe7e51e259 100644 --- a/evadb/udfs/ndarray/similarity.py +++ b/evadb/functions/ndarray/similarity.py @@ -14,11 +14,11 @@ # limitations under the License. import pandas as pd -from evadb.udfs.abstract.abstract_udf import AbstractUDF +from evadb.functions.abstract.abstract_function import AbstractFunction from evadb.utils.generic_utils import try_to_import_faiss -class Similarity(AbstractUDF): +class Similarity(AbstractFunction): def _get_distance(self, numpy_distance): return numpy_distance[0][0] diff --git a/evadb/udfs/ndarray/to_grayscale.py b/evadb/functions/ndarray/to_grayscale.py similarity index 85% rename from evadb/udfs/ndarray/to_grayscale.py rename to evadb/functions/ndarray/to_grayscale.py index f2c057c33e..171c5d9d2a 100644 --- a/evadb/udfs/ndarray/to_grayscale.py +++ b/evadb/functions/ndarray/to_grayscale.py @@ -16,14 +16,14 @@ import pandas as pd from evadb.catalog.catalog_type import NdArrayType -from evadb.udfs.abstract.abstract_udf import AbstractUDF -from evadb.udfs.decorators.decorators import forward, setup -from evadb.udfs.decorators.io_descriptors.data_types import PandasDataframe +from evadb.functions.abstract.abstract_function import AbstractFunction +from evadb.functions.decorators.decorators import forward, setup +from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe from evadb.utils.generic_utils import try_to_import_cv2 -class ToGrayscale(AbstractUDF): - 
@setup(cacheable=False, udf_type="cv2-transformation", batchable=True) +class ToGrayscale(AbstractFunction): + @setup(cacheable=False, function_type="cv2-transformation", batchable=True) def setup(self): try_to_import_cv2() diff --git a/evadb/udfs/ndarray/vertical_flip.py b/evadb/functions/ndarray/vertical_flip.py similarity index 85% rename from evadb/udfs/ndarray/vertical_flip.py rename to evadb/functions/ndarray/vertical_flip.py index 376221dcd0..f89272ab59 100644 --- a/evadb/udfs/ndarray/vertical_flip.py +++ b/evadb/functions/ndarray/vertical_flip.py @@ -16,14 +16,14 @@ import pandas as pd from evadb.catalog.catalog_type import NdArrayType -from evadb.udfs.abstract.abstract_udf import AbstractUDF -from evadb.udfs.decorators.decorators import forward, setup -from evadb.udfs.decorators.io_descriptors.data_types import PandasDataframe +from evadb.functions.abstract.abstract_function import AbstractFunction +from evadb.functions.decorators.decorators import forward, setup +from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe from evadb.utils.generic_utils import try_to_import_cv2 -class VerticalFlip(AbstractUDF): - @setup(cacheable=False, udf_type="cv2-transformation", batchable=True) +class VerticalFlip(AbstractFunction): + @setup(cacheable=False, function_type="cv2-transformation", batchable=True) def setup(self): try_to_import_cv2() diff --git a/evadb/udfs/saliency_feature_extractor.py b/evadb/functions/saliency_feature_extractor.py similarity index 84% rename from evadb/udfs/saliency_feature_extractor.py rename to evadb/functions/saliency_feature_extractor.py index 6f42397cd3..67a00764ea 100644 --- a/evadb/udfs/saliency_feature_extractor.py +++ b/evadb/functions/saliency_feature_extractor.py @@ -20,14 +20,14 @@ from torchvision.transforms import Compose, Resize, ToTensor from evadb.catalog.catalog_type import NdArrayType -from evadb.udfs.abstract.abstract_udf import AbstractUDF -from evadb.udfs.decorators.decorators import forward, 
setup -from evadb.udfs.decorators.io_descriptors.data_types import PandasDataframe -from evadb.udfs.gpu_compatible import GPUCompatible +from evadb.functions.abstract.abstract_function import AbstractFunction +from evadb.functions.decorators.decorators import forward, setup +from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe +from evadb.functions.gpu_compatible import GPUCompatible -class SaliencyFeatureExtractor(AbstractUDF, GPUCompatible): - @setup(cacheable=False, udf_type="FeatureExtraction", batchable=False) +class SaliencyFeatureExtractor(AbstractFunction, GPUCompatible): + @setup(cacheable=False, function_type="FeatureExtraction", batchable=False) def setup(self): self.model = torchvision.models.resnet18(pretrained=True) self.model.eval() diff --git a/evadb/udfs/sentence_feature_extractor.py b/evadb/functions/sentence_feature_extractor.py similarity index 83% rename from evadb/udfs/sentence_feature_extractor.py rename to evadb/functions/sentence_feature_extractor.py index eb2e251826..b7af521bdc 100644 --- a/evadb/udfs/sentence_feature_extractor.py +++ b/evadb/functions/sentence_feature_extractor.py @@ -16,10 +16,10 @@ import pandas as pd from evadb.catalog.catalog_type import NdArrayType -from evadb.udfs.abstract.abstract_udf import AbstractUDF -from evadb.udfs.decorators.decorators import forward, setup -from evadb.udfs.decorators.io_descriptors.data_types import PandasDataframe -from evadb.udfs.gpu_compatible import GPUCompatible +from evadb.functions.abstract.abstract_function import AbstractFunction +from evadb.functions.decorators.decorators import forward, setup +from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe +from evadb.functions.gpu_compatible import GPUCompatible def try_to_import_sentence_transformers(): @@ -36,8 +36,8 @@ def try_to_import_sentence_transformers(): from sentence_transformers import SentenceTransformer # noqa: E402 -class SentenceTransformerFeatureExtractor(AbstractUDF, 
GPUCompatible): - @setup(cacheable=False, udf_type="FeatureExtraction", batchable=False) +class SentenceTransformerFeatureExtractor(AbstractFunction, GPUCompatible): + @setup(cacheable=False, function_type="FeatureExtraction", batchable=False) def setup(self): self.model = SentenceTransformer("all-MiniLM-L6-v2") diff --git a/evadb/udfs/sift_feature_extractor.py b/evadb/functions/sift_feature_extractor.py similarity index 86% rename from evadb/udfs/sift_feature_extractor.py rename to evadb/functions/sift_feature_extractor.py index 6f700dcb4a..cb25f982cb 100644 --- a/evadb/udfs/sift_feature_extractor.py +++ b/evadb/functions/sift_feature_extractor.py @@ -16,10 +16,10 @@ import pandas as pd from evadb.catalog.catalog_type import NdArrayType -from evadb.udfs.abstract.abstract_udf import AbstractUDF -from evadb.udfs.decorators.decorators import forward, setup -from evadb.udfs.decorators.io_descriptors.data_types import PandasDataframe -from evadb.udfs.gpu_compatible import GPUCompatible +from evadb.functions.abstract.abstract_function import AbstractFunction +from evadb.functions.decorators.decorators import forward, setup +from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe +from evadb.functions.gpu_compatible import GPUCompatible from evadb.utils.generic_utils import ( try_to_import_cv2, try_to_import_kornia, @@ -27,8 +27,8 @@ ) -class SiftFeatureExtractor(AbstractUDF, GPUCompatible): - @setup(cacheable=False, udf_type="FeatureExtraction", batchable=False) +class SiftFeatureExtractor(AbstractFunction, GPUCompatible): + @setup(cacheable=False, function_type="FeatureExtraction", batchable=False) def setup(self): try_to_import_kornia() import kornia diff --git a/evadb/udfs/text_filter_keyword.py b/evadb/functions/text_filter_keyword.py similarity index 84% rename from evadb/udfs/text_filter_keyword.py rename to evadb/functions/text_filter_keyword.py index 1666e08f7d..d16d003e99 100644 --- a/evadb/udfs/text_filter_keyword.py +++ 
b/evadb/functions/text_filter_keyword.py @@ -16,13 +16,13 @@ import pandas as pd from evadb.catalog.catalog_type import NdArrayType -from evadb.udfs.abstract.abstract_udf import AbstractUDF -from evadb.udfs.decorators.decorators import forward, setup -from evadb.udfs.decorators.io_descriptors.data_types import PandasDataframe +from evadb.functions.abstract.abstract_function import AbstractFunction +from evadb.functions.decorators.decorators import forward, setup +from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe -class TextFilterKeyword(AbstractUDF): - @setup(cacheable=False, udf_type="TextProcessing", batchable=False) +class TextFilterKeyword(AbstractFunction): + @setup(cacheable=False, function_type="TextProcessing", batchable=False) def setup(self): pass diff --git a/evadb/udfs/trackers/__init__.py b/evadb/functions/trackers/__init__.py similarity index 100% rename from evadb/udfs/trackers/__init__.py rename to evadb/functions/trackers/__init__.py diff --git a/evadb/udfs/trackers/nor_fair.py b/evadb/functions/trackers/nor_fair.py similarity index 93% rename from evadb/udfs/trackers/nor_fair.py rename to evadb/functions/trackers/nor_fair.py index 75dc1493c1..0468a3c997 100644 --- a/evadb/udfs/trackers/nor_fair.py +++ b/evadb/functions/trackers/nor_fair.py @@ -14,14 +14,16 @@ # limitations under the License. 
import numpy as np -from evadb.udfs.abstract.tracker_abstract_udf import EvaDBTrackerAbstractUDF +from evadb.functions.abstract.tracker_abstract_function import ( + EvaDBTrackerAbstractFunction, +) from evadb.utils.generic_utils import try_to_import_norfair from evadb.utils.math_utils import get_centroid DISTANCE_THRESHOLD_CENTROID: int = 30 -class NorFairTracker(EvaDBTrackerAbstractUDF): +class NorFairTracker(EvaDBTrackerAbstractFunction): @property def name(self) -> str: return "NorFairTracker" diff --git a/evadb/udfs/yolo_object_detector.py b/evadb/functions/yolo_object_detector.py similarity index 89% rename from evadb/udfs/yolo_object_detector.py rename to evadb/functions/yolo_object_detector.py index 45d7296265..f4bc48fbf7 100644 --- a/evadb/udfs/yolo_object_detector.py +++ b/evadb/functions/yolo_object_detector.py @@ -16,14 +16,14 @@ import pandas as pd from evadb.catalog.catalog_type import NdArrayType -from evadb.udfs.abstract.abstract_udf import AbstractUDF -from evadb.udfs.decorators.decorators import forward, setup -from evadb.udfs.decorators.io_descriptors.data_types import PandasDataframe -from evadb.udfs.gpu_compatible import GPUCompatible +from evadb.functions.abstract.abstract_function import AbstractFunction +from evadb.functions.decorators.decorators import forward, setup +from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe +from evadb.functions.gpu_compatible import GPUCompatible from evadb.utils.generic_utils import try_to_import_ultralytics -class Yolo(AbstractUDF, GPUCompatible): +class Yolo(AbstractFunction, GPUCompatible): """ Arguments: threshold (float): Threshold for classifier confidence score @@ -33,7 +33,7 @@ class Yolo(AbstractUDF, GPUCompatible): def name(self) -> str: return "yolo" - @setup(cacheable=True, udf_type="object_detection", batchable=True) + @setup(cacheable=True, function_type="object_detection", batchable=True) def setup(self, model: str, threshold=0.3): try_to_import_ultralytics() from 
ultralytics import YOLO diff --git a/evadb/interfaces/relational/db.py b/evadb/interfaces/relational/db.py index 07c57602e7..1a00172fa5 100644 --- a/evadb/interfaces/relational/db.py +++ b/evadb/interfaces/relational/db.py @@ -19,6 +19,7 @@ from evadb.configuration.constants import EvaDB_DATABASE_DIR from evadb.database import EvaDBDatabase, init_evadb_instance from evadb.expression.tuple_value_expression import TupleValueExpression +from evadb.functions.function_bootstrap_queries import init_builtin_functions from evadb.interfaces.relational.relation import EvaDBQuery from evadb.interfaces.relational.utils import try_binding from evadb.models.server.response import Response @@ -26,12 +27,12 @@ from evadb.parser.alias import Alias from evadb.parser.select_statement import SelectStatement from evadb.parser.utils import ( + parse_create_function, parse_create_table, - parse_create_udf, parse_create_vector_index, + parse_drop_function, parse_drop_index, parse_drop_table, - parse_drop_udf, parse_explain, parse_insert, parse_load, @@ -41,7 +42,6 @@ parse_table_clause, ) from evadb.server.command_handler import execute_statement -from evadb.udfs.udf_bootstrap_queries import init_builtin_udfs from evadb.utils.generic_utils import find_nearest_word, is_ray_enabled_and_installed from evadb.utils.logging_manager import logger @@ -321,25 +321,25 @@ def drop_table(self, table_name: str, if_exists: bool = True) -> "EvaDBQuery": stmt = parse_drop_table(table_name, if_exists) return EvaDBQuery(self._evadb, stmt) - def drop_function(self, udf_name: str, if_exists: bool = True) -> "EvaDBQuery": + def drop_function(self, function_name: str, if_exists: bool = True) -> "EvaDBQuery": """ - Drop a udf in the database. + Drop a function in the database. Args: - udf_name (str): Name of the udf to be dropped. - if_exists (bool): If True, do not raise an error if the UDF does not already exist. If False, raise an error. + function_name (str): Name of the function to be dropped. 
+ if_exists (bool): If True, do not raise an error if the function does not already exist. If False, raise an error. Returns: - EvaDBQuery: The EvaDBQuery object representing the DROP UDF. + EvaDBQuery: The EvaDBQuery object representing the DROP FUNCTION. Examples: - Drop UDF 'ObjectDetector' + Drop FUNCTION 'ObjectDetector' >>> cursor.drop_function("ObjectDetector", if_exists = True) 0 - 0 UDF Successfully dropped: ObjectDetector + 0 Function Successfully dropped: ObjectDetector """ - stmt = parse_drop_udf(udf_name, if_exists) + stmt = parse_drop_function(function_name, if_exists) return EvaDBQuery(self._evadb, stmt) def drop_index(self, index_name: str, if_exists: bool = True) -> "EvaDBQuery": @@ -363,48 +363,50 @@ def drop_index(self, index_name: str, if_exists: bool = True) -> "EvaDBQuery": def create_function( self, - udf_name: str, + function_name: str, if_not_exists: bool = True, impl_path: str = None, type: str = None, **kwargs, ) -> "EvaDBQuery": """ - Create a udf in the database. + Create a function in the database. Args: - udf_name (str): Name of the udf to be created. - if_not_exists (bool): If True, do not raise an error if the UDF already exist. If False, raise an error. - impl_path (str): Path string to udf's implementation. - type (str): Type of the udf (e.g. HuggingFace). - **kwargs: Additional keyword arguments for configuring the create udf operation. + function_name (str): Name of the function to be created. + if_not_exists (bool): If True, do not raise an error if the function already exist. If False, raise an error. + impl_path (str): Path string to function's implementation. + type (str): Type of the function (e.g. HuggingFace). + **kwargs: Additional keyword arguments for configuring the create function operation. Returns: - EvaDBQuery: The EvaDBQuery object representing the UDF created. + EvaDBQuery: The EvaDBQuery object representing the function created. 
Examples: >>> cursor.create_function("MnistImageClassifier", if_exists = True, 'mnist_image_classifier.py') 0 - 0 UDF Successfully created: MnistImageClassifier + 0 Function Successfully created: MnistImageClassifier """ - stmt = parse_create_udf(udf_name, if_not_exists, impl_path, type, **kwargs) + stmt = parse_create_function( + function_name, if_not_exists, impl_path, type, **kwargs + ) return EvaDBQuery(self._evadb, stmt) def create_table( self, table_name: str, if_not_exists: bool = True, columns: str = None, **kwargs ) -> "EvaDBQuery": """ - Create a udf in the database. + Create a function in the database. Args: - udf_name (str): Name of the udf to be created. - if_not_exists (bool): If True, do not raise an error if the UDF already exist. If False, raise an error. - impl_path (str): Path string to udf's implementation. - type (str): Type of the udf (e.g. HuggingFace). - **kwargs: Additional keyword arguments for configuring the create udf operation. + function_name (str): Name of the function to be created. + if_not_exists (bool): If True, do not raise an error if the function already exist. If False, raise an error. + impl_path (str): Path string to function's implementation. + type (str): Type of the function (e.g. HuggingFace). + **kwargs: Additional keyword arguments for configuring the create function operation. Returns: - EvaDBQuery: The EvaDBQuery object representing the UDF created. + EvaDBQuery: The EvaDBQuery object representing the function created. Examples: >>> cursor.create_table("MyCSV", if_exists = True, columns=\"\"\" @@ -433,7 +435,7 @@ def query(self, sql_query: str) -> EvaDBQuery: EvaDBQuery: The EvaDBQuery object. Examples: - >>> cursor.query("DROP UDF IF EXISTS SentenceFeatureExtractor;") + >>> cursor.query("DROP FUNCTION IF EXISTS SentenceFeatureExtractor;") >>> cursor.query('SELECT * FROM sample_table;').df() col1 col2 0 1 2 @@ -579,7 +581,7 @@ def connect( # host and port parameters are irrelevant. 
Additionally, for the EvaDBConnection, the # reader and writer parameters are not relevant in the serverless approach. evadb = init_evadb_instance(evadb_dir, custom_db_uri=sql_backend) - init_builtin_udfs(evadb, mode="release") + init_builtin_functions(evadb, mode="release") return EvaDBConnection(evadb, None, None) diff --git a/evadb/models/storage/batch.py b/evadb/models/storage/batch.py index 36e8b3c19a..a29d9ab69b 100644 --- a/evadb/models/storage/batch.py +++ b/evadb/models/storage/batch.py @@ -266,7 +266,7 @@ def merge_column_wise(cls, batches: List[Batch], auto_renaming=False) -> Batch: for i, frame_index in enumerate(frames_index): assert ( frame_index == frames_index[i - 1] - ), "Merging of DataFrames with unmatched indice can cause undefined behavior" + ), "Merging of DataFrames with unmatched indices can cause undefined behavior" new_frames = pd.concat(frames, axis=1, copy=False, ignore_index=False).fillna( method="ffill" diff --git a/evadb/optimizer/operators.py b/evadb/optimizer/operators.py index edfa2d100f..8154bc402f 100644 --- a/evadb/optimizer/operators.py +++ b/evadb/optimizer/operators.py @@ -19,9 +19,9 @@ from evadb.catalog.catalog_type import VectorStoreType from evadb.catalog.models.column_catalog import ColumnCatalogEntry +from evadb.catalog.models.function_io_catalog import FunctionIOCatalogEntry +from evadb.catalog.models.function_metadata_catalog import FunctionMetadataCatalogEntry from evadb.catalog.models.table_catalog import TableCatalogEntry -from evadb.catalog.models.udf_io_catalog import UdfIOCatalogEntry -from evadb.catalog.models.udf_metadata_catalog import UdfMetadataCatalogEntry from evadb.expression.abstract_expression import AbstractExpression from evadb.expression.constant_value_expression import ConstantValueExpression from evadb.expression.function_expression import FunctionExpression @@ -46,7 +46,7 @@ class OperatorType(IntEnum): LOGICALCREATE = auto() LOGICALRENAME = auto() LOGICAL_DROP_OBJECT = auto() - LOGICALCREATEUDF 
= auto() + LOGICALCREATEFUNCTION = auto() LOGICALLOADDATA = auto() LOGICALQUERYDERIVEDGET = auto() LOGICALUNION = auto() @@ -634,46 +634,46 @@ def __hash__(self) -> int: return hash((super().__hash__(), self._new_name, self._old_table_ref)) -class LogicalCreateUDF(Operator): +class LogicalCreateFunction(Operator): """ - Logical node for create udf operations + Logical node for create function operations Attributes: name: str - udf_name provided by the user required + function_name provided by the user required if_not_exists: bool - if true should throw an error if udf with same name exists + if true should throw an error if function with same name exists else will replace the existing - inputs: List[UdfIOCatalogEntry] - udf inputs, annotated list similar to table columns - outputs: List[UdfIOCatalogEntry] - udf outputs, annotated list similar to table columns + inputs: List[FunctionIOCatalogEntry] + function inputs, annotated list similar to table columns + outputs: List[FunctionIOCatalogEntry] + function outputs, annotated list similar to table columns impl_path: Path - file path which holds the implementation of the udf. - This file should be placed in the UDF directory and - the path provided should be relative to the UDF dir. - udf_type: str - udf type. it ca be object detection, classification etc. + file path which holds the implementation of the function. + This file should be placed in the function directory and + the path provided should be relative to the function dir. + function_type: str + function type. it ca be object detection, classification etc. 
""" def __init__( self, name: str, if_not_exists: bool, - inputs: List[UdfIOCatalogEntry], - outputs: List[UdfIOCatalogEntry], + inputs: List[FunctionIOCatalogEntry], + outputs: List[FunctionIOCatalogEntry], impl_path: Path, - udf_type: str = None, - metadata: List[UdfMetadataCatalogEntry] = None, + function_type: str = None, + metadata: List[FunctionMetadataCatalogEntry] = None, children: List = None, ): - super().__init__(OperatorType.LOGICALCREATEUDF, children) + super().__init__(OperatorType.LOGICALCREATEFUNCTION, children) self._name = name self._if_not_exists = if_not_exists self._inputs = inputs self._outputs = outputs self._impl_path = impl_path - self._udf_type = udf_type + self._function_type = function_type self._metadata = metadata @property @@ -697,8 +697,8 @@ def impl_path(self): return self._impl_path @property - def udf_type(self): - return self._udf_type + def function_type(self): + return self._function_type @property def metadata(self): @@ -706,7 +706,7 @@ def metadata(self): def __eq__(self, other): is_subtree_equal = super().__eq__(other) - if not isinstance(other, LogicalCreateUDF): + if not isinstance(other, LogicalCreateFunction): return False return ( is_subtree_equal @@ -714,7 +714,7 @@ def __eq__(self, other): and self.if_not_exists == other.if_not_exists and self.inputs == other.inputs and self.outputs == other.outputs - and self.udf_type == other.udf_type + and self.function_type == other.function_type and self.impl_path == other.impl_path and self.metadata == other.metadata ) @@ -727,7 +727,7 @@ def __hash__(self) -> int: self.if_not_exists, tuple(self.inputs), tuple(self.outputs), - self.udf_type, + self.function_type, self.impl_path, tuple(self.metadata), ) @@ -741,9 +741,9 @@ class LogicalDropObject(Operator): Attributes: object_type: ObjectType name: str - UDF name provided by the user + Function name provided by the user if_exists: bool - if false, throws an error when no UDF with name exists + if false, throws an error when no 
function with name exists else logs a warning """ @@ -1073,7 +1073,7 @@ def __init__( table_ref: TableRef, col_list: List[ColumnDefinition], vector_store_type: VectorStoreType, - udf_func: FunctionExpression = None, + function: FunctionExpression = None, children: List = None, ): super().__init__(OperatorType.LOGICALCREATEINDEX, children) @@ -1081,7 +1081,7 @@ def __init__( self._table_ref = table_ref self._col_list = col_list self._vector_store_type = vector_store_type - self._udf_func = udf_func + self._function = function @property def name(self): @@ -1100,8 +1100,8 @@ def vector_store_type(self): return self._vector_store_type @property - def udf_func(self): - return self._udf_func + def function(self): + return self._function def __eq__(self, other): is_subtree_equal = super().__eq__(other) @@ -1113,7 +1113,7 @@ def __eq__(self, other): and self.table_ref == other.table_ref and self.col_list == other.col_list and self.vector_store_type == other.vector_store_type - and self.udf_func == other.udf_func + and self.function == other.function ) def __hash__(self) -> int: @@ -1124,7 +1124,7 @@ def __hash__(self) -> int: self.table_ref, tuple(self.col_list), self.vector_store_type, - self.udf_func, + self.function, ) ) diff --git a/evadb/optimizer/optimizer_utils.py b/evadb/optimizer/optimizer_utils.py index 36fcc018f9..9bcdeac41d 100644 --- a/evadb/optimizer/optimizer_utils.py +++ b/evadb/optimizer/optimizer_utils.py @@ -20,9 +20,9 @@ from evadb.catalog.catalog_utils import get_table_primary_columns from evadb.catalog.models.column_catalog import ColumnCatalogEntry -from evadb.catalog.models.udf_io_catalog import UdfIOCatalogEntry -from evadb.catalog.models.udf_metadata_catalog import UdfMetadataCatalogEntry -from evadb.constants import CACHEABLE_UDFS, DEFAULT_FUNCTION_EXPRESSION_COST +from evadb.catalog.models.function_io_catalog import FunctionIOCatalogEntry +from evadb.catalog.models.function_metadata_catalog import FunctionMetadataCatalogEntry +from 
evadb.constants import CACHEABLE_FUNCTIONS, DEFAULT_FUNCTION_EXPRESSION_COST from evadb.expression.abstract_expression import AbstractExpression, ExpressionType from evadb.expression.expression_utils import ( conjunction_list_to_expression_tree, @@ -41,8 +41,8 @@ from evadb.utils.kv_cache import DiskKVCache -def column_definition_to_udf_io(col_list: List[ColumnDefinition], is_input: bool): - """Create the UdfIOCatalogEntry object for each column definition provided +def column_definition_to_function_io(col_list: List[ColumnDefinition], is_input: bool): + """Create the FunctionIOCatalogEntry object for each column definition provided Arguments: col_list(List[ColumnDefinition]): parsed input/output definitions @@ -53,9 +53,9 @@ def column_definition_to_udf_io(col_list: List[ColumnDefinition], is_input: bool result_list = [] for col in col_list: - assert col is not None, "Empty column definition while creating udf io" + assert col is not None, "Empty column definition while creating function io" result_list.append( - UdfIOCatalogEntry( + FunctionIOCatalogEntry( col.name, col.type, col.cci.nullable, @@ -67,8 +67,8 @@ def column_definition_to_udf_io(col_list: List[ColumnDefinition], is_input: bool return result_list -def metadata_definition_to_udf_metadata(metadata_list: List[Tuple[str, str]]): - """Create the UdfMetadataCatalogEntry object for each metadata definition provided +def metadata_definition_to_function_metadata(metadata_list: List[Tuple[str, str]]): + """Create the FunctionMetadataCatalogEntry object for each metadata definition provided Arguments: col_list(List[Tuple[str, str]]): parsed metadata definitions @@ -76,7 +76,7 @@ def metadata_definition_to_udf_metadata(metadata_list: List[Tuple[str, str]]): result_list = [] for metadata in metadata_list: result_list.append( - UdfMetadataCatalogEntry( + FunctionMetadataCatalogEntry( metadata[0], metadata[1], ) @@ -238,9 +238,9 @@ def enable_cache_init( catalog = context.db.catalog() name = func_expr.signature() - 
cache_entry = catalog.get_udf_cache_catalog_entry_by_name(name) + cache_entry = catalog.get_function_cache_catalog_entry_by_name(name) if not cache_entry: - cache_entry = catalog.insert_udf_cache_catalog_entry(func_expr) + cache_entry = catalog.insert_function_cache_catalog_entry(func_expr) cache = FunctionExpressionCache( key=tuple(optimized_key), store=DiskKVCache(cache_entry.cache_path) @@ -281,7 +281,7 @@ def enable_cache_on_expression_tree( def check_expr_validity_for_cache(expr: FunctionExpression): return ( - expr.name in CACHEABLE_UDFS + expr.name in CACHEABLE_FUNCTIONS and not expr.has_cache() and len(expr.children) <= 1 and isinstance(expr.children[0], TupleValueExpression) @@ -309,7 +309,9 @@ def get_expression_execution_cost( total_cost = 0 # iterate over all the function expression and accumulate the cost for child_expr in expr.find_all(FunctionExpression): - cost_entry = context.db.catalog().get_udf_cost_catalog_entry(child_expr.name) + cost_entry = context.db.catalog().get_function_cost_catalog_entry( + child_expr.name + ) if cost_entry: total_cost += cost_entry.cost else: diff --git a/evadb/optimizer/rules/rules.py b/evadb/optimizer/rules/rules.py index 33c8905db5..ce287326db 100644 --- a/evadb/optimizer/rules/rules.py +++ b/evadb/optimizer/rules/rules.py @@ -18,7 +18,7 @@ from evadb.catalog.catalog_type import TableType from evadb.catalog.catalog_utils import is_video_table -from evadb.constants import CACHEABLE_UDFS +from evadb.constants import CACHEABLE_FUNCTIONS from evadb.executor.execution_context import Context from evadb.expression.expression_utils import ( conjunction_list_to_expression_tree, @@ -55,8 +55,8 @@ Dummy, LogicalApplyAndMerge, LogicalCreate, + LogicalCreateFunction, LogicalCreateIndex, - LogicalCreateUDF, LogicalDelete, LogicalDropObject, LogicalExchange, @@ -81,9 +81,9 @@ Operator, OperatorType, ) +from evadb.plan_nodes.create_function_plan import CreateFunctionPlan from evadb.plan_nodes.create_index_plan import CreateIndexPlan 
from evadb.plan_nodes.create_plan import CreatePlan -from evadb.plan_nodes.create_udf_plan import CreateUDFPlan from evadb.plan_nodes.delete_plan import DeletePlan from evadb.plan_nodes.drop_object_plan import DropObjectPlan from evadb.plan_nodes.function_scan_plan import FunctionScanPlan @@ -262,8 +262,8 @@ def promise(self): def check(self, before: LogicalApplyAndMerge, context: OptimizerContext): expr = before.func_expr # already cache enabled - # replace the cacheable condition once we have the property supported as part of the UDF itself. - if expr.has_cache() or expr.name not in CACHEABLE_UDFS: + # replace the cacheable condition once we have the property supported as part of the function itself. + if expr.has_cache() or expr.name not in CACHEABLE_FUNCTIONS: return False # we do not support caching function expression instances with multiple arguments or nested function expressions if len(expr.children) > 1 or not isinstance( @@ -348,7 +348,7 @@ class XformLateralJoinToLinearFlow(Rule): eliminate the join node and make the inner node the parent of the outer node. This produces a linear data flow path. Because this scenario is common in our system, we chose to explicitly convert it to a linear flow, which simplifies the - implementation of other optimizations such as UDF reuse and parallelized plans by + implementation of other optimizations such as function reuse and parallelized plans by removing the join.""" def __init__(self): @@ -559,19 +559,19 @@ def _exists_predicate(opr): while not isinstance(tv_expr, TupleValueExpression): tv_expr = tv_expr.children[0] - # Get column catalog entry and udf_signature. + # Get column catalog entry and function_signature. column_catalog_entry = tv_expr.col_object - udf_signature = ( + function_signature = ( None if isinstance(base_func_expr, TupleValueExpression) else base_func_expr.signature() ) # Get index catalog. Check if an index exists for matching - # udf signature and table columns. 
+ # function signature and table columns. index_catalog_entry = ( - catalog_manager().get_index_catalog_entry_by_column_and_udf_signature( - column_catalog_entry, udf_signature + catalog_manager().get_index_catalog_entry_by_column_and_function_signature( + column_catalog_entry, function_signature ) ) if not index_catalog_entry: @@ -639,7 +639,7 @@ def promise(self): return Promise.REORDER_PREDICATES def check(self, before: LogicalFilter, context: OptimizerContext): - # there exists at least one Function Expression + # there exists at least one function Expression return len(list(before.predicate.find_all(FunctionExpression))) > 0 def apply(self, before: LogicalFilter, context: OptimizerContext): @@ -741,50 +741,52 @@ def apply(self, before: LogicalRename, context: OptimizerContext): yield after -class LogicalCreateUDFToPhysical(Rule): +class LogicalCreateFunctionToPhysical(Rule): def __init__(self): - pattern = Pattern(OperatorType.LOGICALCREATEUDF) - super().__init__(RuleType.LOGICAL_CREATE_UDF_TO_PHYSICAL, pattern) + pattern = Pattern(OperatorType.LOGICALCREATEFUNCTION) + super().__init__(RuleType.LOGICAL_CREATE_FUNCTION_TO_PHYSICAL, pattern) def promise(self): - return Promise.LOGICAL_CREATE_UDF_TO_PHYSICAL + return Promise.LOGICAL_CREATE_FUNCTION_TO_PHYSICAL def check(self, before: Operator, context: OptimizerContext): return True - def apply(self, before: LogicalCreateUDF, context: OptimizerContext): - after = CreateUDFPlan( + def apply(self, before: LogicalCreateFunction, context: OptimizerContext): + after = CreateFunctionPlan( before.name, before.if_not_exists, before.inputs, before.outputs, before.impl_path, - before.udf_type, + before.function_type, before.metadata, ) yield after -class LogicalCreateUDFFromSelectToPhysical(Rule): +class LogicalCreateFunctionFromSelectToPhysical(Rule): def __init__(self): - pattern = Pattern(OperatorType.LOGICALCREATEUDF) + pattern = Pattern(OperatorType.LOGICALCREATEFUNCTION) 
pattern.append_child(Pattern(OperatorType.DUMMY)) - super().__init__(RuleType.LOGICAL_CREATE_UDF_FROM_SELECT_TO_PHYSICAL, pattern) + super().__init__( + RuleType.LOGICAL_CREATE_FUNCTION_FROM_SELECT_TO_PHYSICAL, pattern + ) def promise(self): - return Promise.LOGICAL_CREATE_UDF_FROM_SELECT_TO_PHYSICAL + return Promise.LOGICAL_CREATE_FUNCTION_FROM_SELECT_TO_PHYSICAL def check(self, before: Operator, context: OptimizerContext): return True - def apply(self, before: LogicalCreateUDF, context: OptimizerContext): - after = CreateUDFPlan( + def apply(self, before: LogicalCreateFunction, context: OptimizerContext): + after = CreateFunctionPlan( before.name, before.if_not_exists, before.inputs, before.outputs, before.impl_path, - before.udf_type, + before.function_type, before.metadata, ) for child in before.children: @@ -809,7 +811,7 @@ def apply(self, before: LogicalCreateIndex, context: OptimizerContext): before.table_ref, before.col_list, before.vector_store_type, - before.udf_func, + before.function, ) yield after @@ -1332,7 +1334,7 @@ def check(self, before: LogicalProject, context: OptimizerContext): def apply(self, before: LogicalProject, context: OptimizerContext): project_plan = ProjectPlan(before.target_list) - # Check whether the projection contains a UDF + # Check whether the projection contains a Function if before.target_list is None or not any( [isinstance(expr, FunctionExpression) for expr in before.target_list] ): diff --git a/evadb/optimizer/rules/rules_base.py b/evadb/optimizer/rules/rules_base.py index 20faea20d9..c8b49f5804 100644 --- a/evadb/optimizer/rules/rules_base.py +++ b/evadb/optimizer/rules/rules_base.py @@ -67,8 +67,8 @@ class RuleType(Flag): LOGICAL_CREATE_FROM_SELECT_TO_PHYSICAL = auto() LOGICAL_RENAME_TO_PHYSICAL = auto() LOGICAL_DROP_OBJECT_TO_PHYSICAL = auto() - LOGICAL_CREATE_UDF_TO_PHYSICAL = auto() - LOGICAL_CREATE_UDF_FROM_SELECT_TO_PHYSICAL = auto() + LOGICAL_CREATE_FUNCTION_TO_PHYSICAL = auto() + 
LOGICAL_CREATE_FUNCTION_FROM_SELECT_TO_PHYSICAL = auto() LOGICAL_GET_TO_SEQSCAN = auto() LOGICAL_SAMPLE_TO_UNIFORMSAMPLE = auto() LOGICAL_DERIVED_GET_TO_PHYSICAL = auto() @@ -108,8 +108,8 @@ class Promise(IntEnum): LOGICAL_LOAD_TO_PHYSICAL = auto() LOGICAL_CREATE_TO_PHYSICAL = auto() LOGICAL_CREATE_FROM_SELECT_TO_PHYSICAL = auto() - LOGICAL_CREATE_UDF_TO_PHYSICAL = auto() - LOGICAL_CREATE_UDF_FROM_SELECT_TO_PHYSICAL = auto() + LOGICAL_CREATE_FUNCTION_TO_PHYSICAL = auto() + LOGICAL_CREATE_FUNCTION_FROM_SELECT_TO_PHYSICAL = auto() LOGICAL_SAMPLE_TO_UNIFORMSAMPLE = auto() LOGICAL_GET_TO_SEQSCAN = auto() LOGICAL_DERIVED_GET_TO_PHYSICAL = auto() diff --git a/evadb/optimizer/rules/rules_manager.py b/evadb/optimizer/rules/rules_manager.py index 0514b27872..7969f40273 100644 --- a/evadb/optimizer/rules/rules_manager.py +++ b/evadb/optimizer/rules/rules_manager.py @@ -28,10 +28,10 @@ LogicalApplyAndMergeToPhysical, LogicalApplyAndMergeToRayPhysical, LogicalCreateFromSelectToPhysical, + LogicalCreateFunctionFromSelectToPhysical, + LogicalCreateFunctionToPhysical, LogicalCreateIndexToVectorIndex, LogicalCreateToPhysical, - LogicalCreateUDFFromSelectToPhysical, - LogicalCreateUDFToPhysical, LogicalDeleteToPhysical, LogicalDerivedGetToPhysical, LogicalDropObjectToPhysical, @@ -93,8 +93,8 @@ def __init__(self, config: ConfigurationManager): LogicalCreateToPhysical(), LogicalCreateFromSelectToPhysical(), LogicalRenameToPhysical(), - LogicalCreateUDFToPhysical(), - LogicalCreateUDFFromSelectToPhysical(), + LogicalCreateFunctionToPhysical(), + LogicalCreateFunctionFromSelectToPhysical(), LogicalDropObjectToPhysical(), LogicalInsertToPhysical(), LogicalDeleteToPhysical(), diff --git a/evadb/optimizer/statement_to_opr_converter.py b/evadb/optimizer/statement_to_opr_converter.py index 630aea8dea..a06a3f1381 100644 --- a/evadb/optimizer/statement_to_opr_converter.py +++ b/evadb/optimizer/statement_to_opr_converter.py @@ -15,8 +15,8 @@ from evadb.expression.abstract_expression import 
AbstractExpression from evadb.optimizer.operators import ( LogicalCreate, + LogicalCreateFunction, LogicalCreateIndex, - LogicalCreateUDF, LogicalDelete, LogicalDropObject, LogicalExplain, @@ -38,12 +38,12 @@ LogicalUnion, ) from evadb.optimizer.optimizer_utils import ( - column_definition_to_udf_io, - metadata_definition_to_udf_metadata, + column_definition_to_function_io, + metadata_definition_to_function_metadata, ) +from evadb.parser.create_function_statement import CreateFunctionStatement from evadb.parser.create_index_statement import CreateIndexStatement from evadb.parser.create_statement import CreateTableStatement -from evadb.parser.create_udf_statement import CreateUDFStatement from evadb.parser.delete_statement import DeleteTableStatement from evadb.parser.drop_object_statement import DropObjectStatement from evadb.parser.explain_statement import ExplainStatement @@ -54,7 +54,7 @@ from evadb.parser.show_statement import ShowStatement from evadb.parser.statement import AbstractStatement from evadb.parser.table_ref import TableRef -from evadb.parser.types import UDFType +from evadb.parser.types import FunctionType from evadb.utils.logging_manager import logger @@ -76,7 +76,7 @@ def visit_table_ref(self, table_ref: TableRef): elif table_ref.is_table_valued_expr(): tve = table_ref.table_valued_expr - if tve.func_expr.name.lower() == str(UDFType.EXTRACT_OBJECT).lower(): + if tve.func_expr.name.lower() == str(FunctionType.EXTRACT_OBJECT).lower(): self._plan = LogicalExtractObject( detector=tve.func_expr.children[1], tracker=tve.func_expr.children[2], @@ -254,31 +254,33 @@ def visit_rename(self, statement: RenameTableStatement): rename_opr = LogicalRename(statement.old_table_ref, statement.new_table_name) self._plan = rename_opr - def visit_create_udf(self, statement: CreateUDFStatement): - """Converter for parsed create udf statement + def visit_create_function(self, statement: CreateFunctionStatement): + """Converter for parsed create function statement 
Arguments: - statement {CreateUDFStatement} - - Create UDF Statement + statement {CreateFunctionStatement} - - CreateFunctionStatement """ - annotated_inputs = column_definition_to_udf_io(statement.inputs, True) - annotated_outputs = column_definition_to_udf_io(statement.outputs, False) - annotated_metadata = metadata_definition_to_udf_metadata(statement.metadata) + annotated_inputs = column_definition_to_function_io(statement.inputs, True) + annotated_outputs = column_definition_to_function_io(statement.outputs, False) + annotated_metadata = metadata_definition_to_function_metadata( + statement.metadata + ) - create_udf_opr = LogicalCreateUDF( + create_function_opr = LogicalCreateFunction( statement.name, statement.if_not_exists, annotated_inputs, annotated_outputs, statement.impl_path, - statement.udf_type, + statement.function_type, annotated_metadata, ) if statement.query is not None: self.visit_select(statement.query) - create_udf_opr.append_child(self._plan) + create_function_opr.append_child(self._plan) - self._plan = create_udf_opr + self._plan = create_function_opr def visit_drop_object(self, statement: DropObjectStatement): self._plan = LogicalDropObject( @@ -312,7 +314,7 @@ def visit_create_index(self, statement: CreateIndexStatement): statement.table_ref, statement.col_list, statement.vector_store_type, - statement.udf_func, + statement.function, ) self._plan = create_index_opr @@ -339,8 +341,8 @@ def visit(self, statement: AbstractStatement): self.visit_create(statement) elif isinstance(statement, RenameTableStatement): self.visit_rename(statement) - elif isinstance(statement, CreateUDFStatement): - self.visit_create_udf(statement) + elif isinstance(statement, CreateFunctionStatement): + self.visit_create_function(statement) elif isinstance(statement, DropObjectStatement): self.visit_drop_object(statement) elif isinstance(statement, LoadDataStatement): diff --git a/evadb/parser/create_udf_statement.py b/evadb/parser/create_function_statement.py 
similarity index 71% rename from evadb/parser/create_udf_statement.py rename to evadb/parser/create_function_statement.py index baf977b2d2..bcf3318f3d 100644 --- a/evadb/parser/create_udf_statement.py +++ b/evadb/parser/create_function_statement.py @@ -21,29 +21,29 @@ from evadb.parser.types import StatementType -class CreateUDFStatement(AbstractStatement): - """Create UDF Statement constructed after parsing the input query +class CreateFunctionStatement(AbstractStatement): + """CreateFunctionStatement constructed after parsing the input query Attributes: name: str - udf_name provided by the user required + function_name provided by the user required if_not_exists: bool - if true should throw an error if udf with same name exists + if true should throw an error if function with same name exists else will replace the existing inputs: List[ColumnDefinition] - udf inputs, represented similar to a table column definition + function inputs, represented similar to a table column definition outputs: List[ColumnDefinition] - udf outputs, represented similar to a table column definition + function outputs, represented similar to a table column definition impl_file_path: str - file path which holds the implementation of the udf. - This file should be placed in the UDF directory and - the path provided should be relative to the UDF dir. + file path which holds the implementation of the function. + This file should be placed in the function directory and + the path provided should be relative to the function dir. query: SelectStatement data source for the model train or fine tune. - udf_type: str - udf type. it can be object detection, classification etc. + function_type: str + function type. it can be object detection, classification etc. 
metadata: List[Tuple[str, str]] - metadata, list of key value pairs used for storing metadata of udfs, mostly used for advanced udf types + metadata, list of key value pairs used for storing metadata of functions, mostly used for advanced function types """ def __init__( @@ -53,22 +53,22 @@ def __init__( impl_path: str, inputs: List[ColumnDefinition] = [], outputs: List[ColumnDefinition] = [], - udf_type: str = None, + function_type: str = None, query: SelectStatement = None, metadata: List[Tuple[str, str]] = None, ): - super().__init__(StatementType.CREATE_UDF) + super().__init__(StatementType.CREATE_FUNCTION) self._name = name self._if_not_exists = if_not_exists self._inputs = inputs self._outputs = outputs self._impl_path = Path(impl_path) if impl_path else None - self._udf_type = udf_type + self._function_type = function_type self._query = query self._metadata = metadata def __str__(self) -> str: - s = "CREATE UDF" + s = "CREATE FUNCTION" if self._if_not_exists: s += " IF NOT EXISTS" @@ -78,15 +78,17 @@ def __str__(self) -> str: if self._query is not None: s += f" FROM ({self._query})" - if self._udf_type is not None: - s += " TYPE " + str(self._udf_type) + if self._function_type is not None: + s += " TYPE " + str(self._function_type) if self._impl_path: s += f" IMPL {self._impl_path.name}" if self._metadata is not None: for key, value in self._metadata: - s += f" '{key}' '{value}'" + # NOTE :- Removing quotes around key and making it upper case + # Since in tests we are doing a straight string comparison + s += f" {key.upper()} '{value}'" return s @property @@ -118,8 +120,8 @@ def impl_path(self): return self._impl_path @property - def udf_type(self): - return self._udf_type + def function_type(self): + return self._function_type @property def query(self): @@ -130,7 +132,7 @@ def metadata(self): return self._metadata def __eq__(self, other): - if not isinstance(other, CreateUDFStatement): + if not isinstance(other, CreateFunctionStatement): return False return 
( self.name == other.name @@ -138,7 +140,7 @@ def __eq__(self, other): and self.inputs == other.inputs and self.outputs == other.outputs and self.impl_path == other.impl_path - and self.udf_type == other.udf_type + and self.function_type == other.function_type and self.query == other.query and self.metadata == other.metadata ) @@ -152,7 +154,7 @@ def __hash__(self) -> int: tuple(self.inputs), tuple(self.outputs), self.impl_path, - self.udf_type, + self.function_type, self.query, tuple(self.metadata), ) diff --git a/evadb/parser/create_index_statement.py b/evadb/parser/create_index_statement.py index 0d508e7bef..eac90fc5cf 100644 --- a/evadb/parser/create_index_statement.py +++ b/evadb/parser/create_index_statement.py @@ -29,20 +29,20 @@ def __init__( table_ref: TableRef, col_list: List[ColumnDefinition], vector_store_type: VectorStoreType, - udf_func: FunctionExpression = None, + function: FunctionExpression = None, ): super().__init__(StatementType.CREATE_INDEX) self._name = name self._table_ref = table_ref self._col_list = col_list self._vector_store_type = vector_store_type - self._udf_func = udf_func + self._function = function def __str__(self) -> str: print_str = "CREATE INDEX {} ON {} ({}{}) ".format( self._name, self._table_ref, - "" if self._udf_func else self._udf_func, + "" if self._function else self._function, tuple(self._col_list), ) return print_str @@ -64,8 +64,8 @@ def vector_store_type(self): return self._vector_store_type @property - def udf_func(self): - return self._udf_func + def function(self): + return self._function def __eq__(self, other): if not isinstance(other, CreateIndexStatement): @@ -75,7 +75,7 @@ def __eq__(self, other): and self._table_ref == other.table_ref and self.col_list == other.col_list and self._vector_store_type == other.vector_store_type - and self._udf_func == other.udf_func + and self._function == other.function ) def __hash__(self) -> int: @@ -86,6 +86,6 @@ def __hash__(self) -> int: self._table_ref, 
tuple(self.col_list), self._vector_store_type, - self._udf_func, + self._function, ) ) diff --git a/evadb/parser/drop_object_statement.py b/evadb/parser/drop_object_statement.py index eec95ec528..f88e2ead92 100644 --- a/evadb/parser/drop_object_statement.py +++ b/evadb/parser/drop_object_statement.py @@ -25,7 +25,7 @@ class DropObjectStatement(AbstractStatement): name (str name of the object to drop if_exists: bool - if false, throws an error when no UDF with name exists + if false, throws an error when no function with name exists else logs a warning """ diff --git a/evadb/parser/evadb.lark b/evadb/parser/evadb.lark index ac15951b58..a958dd54c5 100644 --- a/evadb/parser/evadb.lark +++ b/evadb/parser/evadb.lark @@ -4,8 +4,8 @@ start: (sql_statement? ";")+ sql_statement: ddl_statement | dml_statement | utility_statement | context_statement -ddl_statement: create_database | create_table | create_index | create_udf - | drop_database | drop_table | drop_udf | drop_index | rename_table +ddl_statement: create_database | create_table | create_index | create_function + | drop_database | drop_table | drop_function | drop_index | rename_table dml_statement: select_statement | insert_statement | update_statement | delete_statement | load_statement @@ -34,24 +34,24 @@ create_table: CREATE TABLE if_not_exists? table_name (create_definitions | (AS s rename_table: RENAME TABLE table_name TO table_name -// Create UDFs -create_udf: CREATE UDF if_not_exists? udf_name INPUT create_definitions OUTPUT create_definitions TYPE udf_type IMPL udf_impl udf_metadata* - | CREATE UDF if_not_exists? udf_name IMPL udf_impl udf_metadata* - | CREATE UDF if_not_exists? udf_name TYPE udf_type udf_metadata* - | CREATE UDF if_not_exists? udf_name FROM LR_BRACKET select_statement RR_BRACKET TYPE udf_type udf_metadata* +// Create Functions +create_function: CREATE FUNCTION if_not_exists? 
function_name INPUT create_definitions OUTPUT create_definitions TYPE function_type IMPL function_impl function_metadata* + | CREATE FUNCTION if_not_exists? function_name IMPL function_impl function_metadata* + | CREATE FUNCTION if_not_exists? function_name TYPE function_type function_metadata* + | CREATE FUNCTION if_not_exists? function_name FROM LR_BRACKET select_statement RR_BRACKET TYPE function_type function_metadata* // Details -udf_name: uid +function_name: uid -udf_type: uid +function_type: uid -udf_impl: string_literal +function_impl: string_literal -udf_metadata: udf_metadata_key udf_metadata_value +function_metadata: function_metadata_key function_metadata_value -udf_metadata_key: string_literal +function_metadata_key: uid -udf_metadata_value: string_literal | decimal_literal +function_metadata_value: string_literal | decimal_literal vector_store_type: USING (FAISS | QDRANT) @@ -77,7 +77,7 @@ drop_index: DROP INDEX if_exists? uid drop_table: DROP TABLE if_exists? uid -drop_udf: DROP UDF if_exists? uid +drop_function: DROP FUNCTION if_exists? uid // Data Manipulation Language @@ -171,7 +171,7 @@ describe_statement: DESCRIBE table_name help_statement: HELP STRING_LITERAL -show_statement: SHOW (UDFS | TABLES) +show_statement: SHOW (FUNCTIONS | TABLES) explain_statement: EXPLAIN explainable_statement @@ -267,10 +267,10 @@ if_not_exists: IF NOT EXISTS // Functions -function_call: udf_function ->udf_function_call +function_call: function ->function_call | aggregate_windowed_function ->aggregate_function_call -udf_function: simple_id "(" (STAR | function_args) ")" dotted_id? +function: simple_id "(" (STAR | function_args) ")" dotted_id? 
aggregate_windowed_function: aggregate_function_name "(" function_arg ")" | COUNT "(" (STAR | function_arg) ")" @@ -386,7 +386,7 @@ TABLE: "TABLE"i TABLES: "TABLES"i TO: "TO"i TRUE: "TRUE"i -UDFS: "UDFS"i +FUNCTIONS: "FUNCTIONS"i UNION: "UNION"i UNIQUE: "UNIQUE"i UNKNOWN: "UNKNOWN"i @@ -459,8 +459,8 @@ HELP: "HELP"i TEMPTABLE: "TEMPTABLE"i VALUE: "VALUE"i -// UDF -UDF: "UDF"i +// Function +FUNCTION: "FUNCTION"i INPUT: "INPUT"i OUTPUT: "OUTPUT"i TYPE: "TYPE"i diff --git a/evadb/parser/lark_visitor/_create_statements.py b/evadb/parser/lark_visitor/_create_statements.py index 07f06ba2f8..536efe0199 100644 --- a/evadb/parser/lark_visitor/_create_statements.py +++ b/evadb/parser/lark_visitor/_create_statements.py @@ -261,10 +261,10 @@ def create_index(self, tree): elif child.data == "index_elem": index_elem = self.visit(child) - # Parse either a single UDF function call or column list. - col_list, udf_func = None, None + # Parse either a single function call or column list. + col_list, function = None, None if not isinstance(index_elem, list): - udf_func = index_elem + function = index_elem # Traverse to the tuple value expression. 
while not isinstance(index_elem, TupleValueExpression): @@ -276,7 +276,7 @@ def create_index(self, tree): ] return CreateIndexStatement( - index_name, table_ref, col_list, vector_store_type, udf_func + index_name, table_ref, col_list, vector_store_type, function ) diff --git a/evadb/parser/lark_visitor/_drop_statement.py b/evadb/parser/lark_visitor/_drop_statement.py index 1b3a06c2b0..fbf922fb36 100644 --- a/evadb/parser/lark_visitor/_drop_statement.py +++ b/evadb/parser/lark_visitor/_drop_statement.py @@ -46,16 +46,16 @@ def drop_index(self, tree): return DropObjectStatement(ObjectType.INDEX, index_name, if_exists) - # Drop UDF - def drop_udf(self, tree): - udf_name = None + # Drop Function + def drop_function(self, tree): + function_name = None if_exists = False for child in tree.children: if isinstance(child, Tree): if child.data == "uid": - udf_name = self.visit(child) + function_name = self.visit(child) elif child.data == "if_exists": if_exists = True - return DropObjectStatement(ObjectType.UDF, udf_name, if_exists) + return DropObjectStatement(ObjectType.FUNCTION, function_name, if_exists) diff --git a/evadb/parser/lark_visitor/_functions.py b/evadb/parser/lark_visitor/_functions.py index d8f0503de5..6c354cac19 100644 --- a/evadb/parser/lark_visitor/_functions.py +++ b/evadb/parser/lark_visitor/_functions.py @@ -20,32 +20,32 @@ from evadb.expression.constant_value_expression import ConstantValueExpression from evadb.expression.function_expression import FunctionExpression from evadb.expression.tuple_value_expression import TupleValueExpression -from evadb.parser.create_udf_statement import CreateUDFStatement +from evadb.parser.create_function_statement import CreateFunctionStatement ################################################################## -# Functions - UDFs, Aggregate Windowed functions +# Functions - Functions, Aggregate Windowed functions ################################################################## class Functions: - def udf_function(self, 
tree): - udf_name = None - udf_output = None - udf_args = None + def function(self, tree): + function_name = None + function_output = None + function_args = None for child in tree.children: if isinstance(child, Token): if child.value == "*": - udf_args = [TupleValueExpression(name="*")] + function_args = [TupleValueExpression(name="*")] if isinstance(child, Tree): if child.data == "simple_id": - udf_name = self.visit(child) + function_name = self.visit(child) elif child.data == "dotted_id": - udf_output = self.visit(child) + function_output = self.visit(child) elif child.data == "function_args": - udf_args = self.visit(child) + function_args = self.visit(child) - func_expr = FunctionExpression(None, name=udf_name, output=udf_output) - for arg in udf_args: + func_expr = FunctionExpression(None, name=function_name, output=function_output) + for arg in function_args: func_expr.append_child(arg) return func_expr @@ -57,55 +57,57 @@ def function_args(self, tree): args.append(self.visit(child)) return args - # Create UDF - def create_udf(self, tree): - udf_name = None + # Create function + def create_function(self, tree): + function_name = None if_not_exists = False input_definitions = [] output_definitions = [] impl_path = None - udf_type = None + function_type = None query = None metadata = [] create_definitions_index = 0 for child in tree.children: if isinstance(child, Tree): - if child.data == "udf_name": - udf_name = self.visit(child) + if child.data == "function_name": + function_name = self.visit(child) elif child.data == "if_not_exists": if_not_exists = True elif child.data == "create_definitions": # There should be 2 createDefinition - # idx 0 describing udf INPUT - # idx 1 describing udf OUTPUT + # idx 0 describing function INPUT + # idx 1 describing function OUTPUT if create_definitions_index == 0: input_definitions = self.visit(child) create_definitions_index += 1 elif create_definitions_index == 1: output_definitions = self.visit(child) - elif child.data == 
"udf_type": - udf_type = self.visit(child) - elif child.data == "udf_impl": + elif child.data == "function_type": + function_type = self.visit(child) + elif child.data == "function_impl": impl_path = self.visit(child).value elif child.data == "simple_select": query = self.visit(child) - elif child.data == "udf_metadata": - # Each UDF metadata is a key value pair + elif child.data == "function_metadata": + # Each function metadata is a key value pair key_value_pair = self.visit(child) # value can be an integer or string value = key_value_pair[1] if isinstance(value, ConstantValueExpression): value = value.value - metadata.append((key_value_pair[0].value, value)), + # Removing .value from key_value_pair[0] since key is now an ID_LITERAL + # Adding lower() to ensure the key is in lowercase + metadata.append((key_value_pair[0].lower(), value)), - return CreateUDFStatement( - udf_name, + return CreateFunctionStatement( + function_name, if_not_exists, impl_path, input_definitions, output_definitions, - udf_type, + function_type, query, metadata, ) diff --git a/evadb/parser/lark_visitor/_show_statements.py b/evadb/parser/lark_visitor/_show_statements.py index 5994841419..b30d29b5ac 100644 --- a/evadb/parser/lark_visitor/_show_statements.py +++ b/evadb/parser/lark_visitor/_show_statements.py @@ -23,7 +23,7 @@ class Show: def show_statement(self, tree): token = tree.children[1] - if str.upper(token) == "UDFS": - return ShowStatement(show_type=ShowType.UDFS) + if str.upper(token) == "FUNCTIONS": + return ShowStatement(show_type=ShowType.FUNCTIONS) elif str.upper(token) == "TABLES": return ShowStatement(show_type=ShowType.TABLES) diff --git a/evadb/parser/show_statement.py b/evadb/parser/show_statement.py index ff176983ad..0516bc8639 100644 --- a/evadb/parser/show_statement.py +++ b/evadb/parser/show_statement.py @@ -29,8 +29,8 @@ def show_type(self): def __str__(self): show_str = "" - if self.show_type == ShowType.UDFS: - show_str = "UDFS" + if self.show_type == 
ShowType.FUNCTIONS: + show_str = "FUNCTIONS" elif self.show_type == ShowType.TABLES: show_str = "TABLES" diff --git a/evadb/parser/types.py b/evadb/parser/types.py index 18b770fa8b..0abebcb097 100644 --- a/evadb/parser/types.py +++ b/evadb/parser/types.py @@ -34,7 +34,7 @@ class StatementType(EvaDBEnum): DROP_OBJECT # noqa: F821 INSERT # noqa: F821 DELETE # noqa: F821 - CREATE_UDF # noqa: F821 + CREATE_FUNCTION # noqa: F821 LOAD_DATA # noqa: F821 SHOW # noqa: F821 EXPLAIN # noqa: F821 @@ -67,15 +67,15 @@ class FileFormatType(EvaDBEnum): class ShowType(EvaDBEnum): - UDFS # noqa: F821 + FUNCTIONS # noqa: F821 TABLES # noqa: F821 -class UDFType(EvaDBEnum): +class FunctionType(EvaDBEnum): EXTRACT_OBJECT # noqa: F821 class ObjectType(EvaDBEnum): TABLE # noqa: F821 - UDF # noqa: F821 + FUNCTION # noqa: F821 INDEX # noqa: F821 diff --git a/evadb/parser/utils.py b/evadb/parser/utils.py index a4f0dec91f..70db55cecc 100644 --- a/evadb/parser/utils.py +++ b/evadb/parser/utils.py @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from evadb.parser.create_function_statement import CreateFunctionStatement from evadb.parser.create_statement import CreateDatabaseStatement, CreateTableStatement -from evadb.parser.create_udf_statement import CreateUDFStatement from evadb.parser.drop_object_statement import DropObjectStatement from evadb.parser.explain_statement import ExplainStatement from evadb.parser.insert_statement import InsertTableStatement @@ -58,25 +58,31 @@ def parse_table_clause(expr: str, chunk_size: int = None, chunk_overlap: int = N return stmt.from_table -def parse_create_udf( - udf_name: str, if_not_exists: bool, udf_file_path: str, type: str, **kwargs +def parse_create_function( + function_name: str, + if_not_exists: bool, + function_file_path: str, + type: str, + **kwargs, ): mock_query = ( - f"CREATE UDF IF NOT EXISTS {udf_name}" + f"CREATE FUNCTION IF NOT EXISTS {function_name}" if if_not_exists - else f"CREATE UDF {udf_name}" + else f"CREATE FUNCTION {function_name}" ) if type is not None: mock_query += f" TYPE {type}" task, model = kwargs["task"], kwargs["model"] if task is not None and model is not None: - mock_query += f" 'task' '{task}' 'model' '{model}'" + mock_query += f" TASK '{task}' MODEL '{model}'" else: - mock_query += f" IMPL '{udf_file_path}'" + mock_query += f" IMPL '{function_file_path}'" mock_query += ";" stmt = Parser().parse(mock_query)[0] - assert isinstance(stmt, CreateUDFStatement), "Expected a create udf statement" + assert isinstance( + stmt, CreateFunctionStatement + ), "Expected a create function statement" return stmt @@ -135,8 +141,8 @@ def parse_drop_table(table_name: str, if_exists: bool): return parse_drop(ObjectType.TABLE, table_name, if_exists) -def parse_drop_udf(udf_name: str, if_exists: bool): - return parse_drop(ObjectType.UDF, udf_name, if_exists) +def parse_drop_function(function_name: str, if_exists: bool): + return parse_drop(ObjectType.FUNCTION, function_name, if_exists) def parse_drop_index(index_name: str, if_exists: bool): diff 
--git a/evadb/plan_nodes/create_udf_plan.py b/evadb/plan_nodes/create_function_plan.py similarity index 65% rename from evadb/plan_nodes/create_udf_plan.py rename to evadb/plan_nodes/create_function_plan.py index b39146b037..e05ef18914 100644 --- a/evadb/plan_nodes/create_udf_plan.py +++ b/evadb/plan_nodes/create_function_plan.py @@ -15,49 +15,49 @@ from pathlib import Path from typing import List -from evadb.catalog.models.udf_io_catalog import UdfIOCatalogEntry -from evadb.catalog.models.udf_metadata_catalog import UdfMetadataCatalogEntry +from evadb.catalog.models.function_io_catalog import FunctionIOCatalogEntry +from evadb.catalog.models.function_metadata_catalog import FunctionMetadataCatalogEntry from evadb.plan_nodes.abstract_plan import AbstractPlan from evadb.plan_nodes.types import PlanOprType -class CreateUDFPlan(AbstractPlan): +class CreateFunctionPlan(AbstractPlan): """ - This plan is used for storing information required to create udf operators + This plan is used for storing information required to create function operators Attributes: name: str - udf_name provided by the user required + function_name provided by the user required if_not_exists: bool - if true should throw an error if udf with same name exists + if true should throw an error if function with same name exists else will replace the existing - inputs: List[UdfIOCatalogEntry] - udf inputs, annotated list similar to table columns - outputs: List[UdfIOCatalogEntry] - udf outputs, annotated list similar to table columns + inputs: List[FunctionIOCatalogEntry] + function inputs, annotated list similar to table columns + outputs: List[FunctionIOCatalogEntry] + function outputs, annotated list similar to table columns impl_file_path: Path - file path which holds the implementation of the udf. - udf_type: str - udf type. it ca be object detection, classification etc. + file path which holds the implementation of the function. + function_type: str + function type. 
it ca be object detection, classification etc. """ def __init__( self, name: str, if_not_exists: bool, - inputs: List[UdfIOCatalogEntry], - outputs: List[UdfIOCatalogEntry], + inputs: List[FunctionIOCatalogEntry], + outputs: List[FunctionIOCatalogEntry], impl_file_path: Path, - udf_type: str = None, - metadata: List[UdfMetadataCatalogEntry] = None, + function_type: str = None, + metadata: List[FunctionMetadataCatalogEntry] = None, ): - super().__init__(PlanOprType.CREATE_UDF) + super().__init__(PlanOprType.CREATE_FUNCTION) self._name = name self._if_not_exists = if_not_exists self._inputs = inputs self._outputs = outputs self._impl_path = impl_file_path - self._udf_type = udf_type + self._function_type = function_type self._metadata = metadata @property @@ -81,27 +81,27 @@ def impl_path(self): return self._impl_path @property - def udf_type(self): - return self._udf_type + def function_type(self): + return self._function_type @property def metadata(self): return self._metadata def __str__(self): - return "CreateUDFPlan(name={}, \ + return "CreateFunctionPlan(name={}, \ if_not_exists={}, \ inputs={}, \ outputs={}, \ impl_file_path={}, \ - udf_type={}, \ + function_type={}, \ metadata={})".format( self._name, self._if_not_exists, self._inputs, self._outputs, self._impl_path, - self._udf_type, + self._function_type, self._metadata, ) @@ -113,7 +113,7 @@ def __hash__(self) -> int: tuple(self.inputs), tuple(self.outputs), self.impl_path, - self.udf_type, + self.function_type, tuple(self.metadata), ) ) diff --git a/evadb/plan_nodes/create_index_plan.py b/evadb/plan_nodes/create_index_plan.py index 9449cf53f1..adbdccce5a 100644 --- a/evadb/plan_nodes/create_index_plan.py +++ b/evadb/plan_nodes/create_index_plan.py @@ -29,14 +29,14 @@ def __init__( table_ref: TableRef, col_list: List[ColumnDefinition], vector_store_type: VectorStoreType, - udf_func: FunctionExpression = None, + function: FunctionExpression = None, ): super().__init__(PlanOprType.CREATE_INDEX) self._name = 
name self._table_ref = table_ref self._col_list = col_list self._vector_store_type = vector_store_type - self._udf_func = udf_func + self._function = function @property def name(self): @@ -55,8 +55,8 @@ def vector_store_type(self): return self._vector_store_type @property - def udf_func(self): - return self._udf_func + def function(self): + return self._function def __str__(self): return "CreateIndexPlan(name={}, \ @@ -68,7 +68,7 @@ def __str__(self): self._table_ref, tuple(self._col_list), self._vector_store_type, - "" if not self._udf_func else "udf_func={}".format(self._udf_func), + "" if not self._function else "function={}".format(self._function), ) def __hash__(self) -> int: @@ -79,6 +79,6 @@ def __hash__(self) -> int: self.table_ref, tuple(self.col_list), self.vector_store_type, - self.udf_func, + self.function, ) ) diff --git a/evadb/plan_nodes/show_info_plan.py b/evadb/plan_nodes/show_info_plan.py index 0b09646050..da8c4c2813 100644 --- a/evadb/plan_nodes/show_info_plan.py +++ b/evadb/plan_nodes/show_info_plan.py @@ -27,8 +27,8 @@ def show_type(self): return self._show_type def __str__(self): - if self._show_type == ShowType.UDFS: - return "ShowUDFPlan" + if self._show_type == ShowType.FUNCTIONS: + return "ShowFunctionPlan" else: return "ShowTablePlan" diff --git a/evadb/plan_nodes/types.py b/evadb/plan_nodes/types.py index f7e43117e8..7b989e8e89 100644 --- a/evadb/plan_nodes/types.py +++ b/evadb/plan_nodes/types.py @@ -26,7 +26,7 @@ class PlanOprType(Enum): CREATE = auto() RENAME = auto() DROP_OBJECT = auto() - CREATE_UDF = auto() + CREATE_FUNCTION = auto() LOAD_DATA = auto() UNION = auto() GROUP_BY = auto() diff --git a/evadb/server/server.py b/evadb/server/server.py index b4bd68d2cb..0105279a32 100644 --- a/evadb/server/server.py +++ b/evadb/server/server.py @@ -17,7 +17,7 @@ from asyncio import StreamReader, StreamWriter from evadb.database import init_evadb_instance -from evadb.udfs.udf_bootstrap_queries import init_builtin_udfs +from 
evadb.functions.function_bootstrap_queries import init_builtin_functions from evadb.utils.logging_manager import logger @@ -48,9 +48,9 @@ async def start_evadb_server( self._server = await asyncio.start_server(self.accept_client, host, port) - # load built-in udfs + # load built-in functions mode = self._evadb.config.get_value("core", "mode") - init_builtin_udfs(self._evadb, mode=mode) + init_builtin_functions(self._evadb, mode=mode) async with self._server: await self._server.serve_forever() diff --git a/evadb/third_party/databases/interface.py b/evadb/third_party/databases/interface.py index 4f285adcec..f49403b92e 100644 --- a/evadb/third_party/databases/interface.py +++ b/evadb/third_party/databases/interface.py @@ -15,10 +15,6 @@ import importlib import os -import pip - -INSTALL_CACHE = [] - def get_database_handler(engine: str, **kwargs): """ @@ -26,41 +22,25 @@ def get_database_handler(engine: str, **kwargs): their new integrated handlers. """ - # Dynamically install dependencies. - dynamic_install(engine) - # Dynamically import the top module. - mod = dynamic_import(engine) + try: + mod = dynamic_import(engine) + except ImportError: + req_file = os.path.join(os.path.dirname(__file__), engine, "requirements.txt") + if os.path.isfile(req_file): + with open(req_file) as f: + raise ImportError(f"Please install the following packages {f.read()}") if engine == "postgres": return mod.PostgresHandler(engine, **kwargs) + elif engine == "sqlite": + return mod.SQLiteHandler(engine, **kwargs) + elif engine == "mysql": + return mod.MysqlHandler(engine, **kwargs) else: raise NotImplementedError(f"Engine {engine} is not supported") -def dynamic_install(handler_dir): - """ - Dynamically install package from requirements.txt. 
- """ - - # Skip installation - if handler_dir in INSTALL_CACHE: - return - - INSTALL_CACHE.append(handler_dir) - - req_file = os.path.join( - "evadb", "third_party", "databases", handler_dir, "requirements.txt" - ) - if os.path.isfile(req_file): - with open(req_file) as f: - for package in f.read().splitlines(): - if hasattr(pip, "main"): - pip.main(["install", package]) - else: - pip._internal.main(["install", package]) - - def dynamic_import(handler_dir): import_path = f"evadb.third_party.databases.{handler_dir}.{handler_dir}_handler" return importlib.import_module(import_path) diff --git a/test/integration_tests/long/udfs/ndarray/__init__.py b/evadb/third_party/databases/mysql/__init__.py similarity index 90% rename from test/integration_tests/long/udfs/ndarray/__init__.py rename to evadb/third_party/databases/mysql/__init__.py index 64a9fd8c1f..7a495f2ed8 100644 --- a/test/integration_tests/long/udfs/ndarray/__init__.py +++ b/evadb/third_party/databases/mysql/__init__.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""user defined test functions operating on ndarrays udfs""" +"""MySQL integrations""" diff --git a/evadb/third_party/databases/mysql/mysql_handler.py b/evadb/third_party/databases/mysql/mysql_handler.py new file mode 100644 index 0000000000..829cc243c8 --- /dev/null +++ b/evadb/third_party/databases/mysql/mysql_handler.py @@ -0,0 +1,132 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import mysql.connector +import pandas as pd + +from evadb.third_party.databases.types import ( + DBHandler, + DBHandlerResponse, + DBHandlerStatus, +) + + +class MysqlHandler(DBHandler): + def __init__(self, name: str, **kwargs): + super().__init__(name) + self.host = kwargs.get("host") + self.port = kwargs.get("port") + self.user = kwargs.get("user") + self.password = kwargs.get("password") + self.database = kwargs.get("database") + + def connect(self): + try: + self.connection = mysql.connector.connect( + host=self.host, + port=self.port, + user=self.user, + password=self.password, + database=self.database, + ) + self.connection.autocommit = True + return DBHandlerStatus(status=True) + except mysql.connector.Error as e: + return DBHandlerStatus(status=False, error=str(e)) + + def disconnect(self): + if self.connection: + self.connection.close() + + def check_connection(self) -> DBHandlerStatus: + if self.connection: + return DBHandlerStatus(status=True) + else: + return DBHandlerStatus(status=False, error="Not connected to the database.") + + def get_tables(self) -> DBHandlerResponse: + if not self.connection: + return DBHandlerResponse(data=None, error="Not connected to the database.") + + try: + query = f"SELECT table_name as 'table_name' FROM information_schema.tables WHERE table_schema='{self.database}'" + tables_df = pd.read_sql_query(query, self.connection) + return DBHandlerResponse(data=tables_df) + except mysql.connector.Error as e: + return DBHandlerResponse(data=None, error=str(e)) + + def get_columns(self, table_name: str) -> 
DBHandlerResponse: + if not self.connection: + return DBHandlerResponse(data=None, error="Not connected to the database.") + + try: + query = f"SELECT column_name as 'name', data_type as dtype FROM information_schema.columns WHERE table_name='{table_name}'" + columns_df = pd.read_sql_query(query, self.connection) + columns_df["dtype"] = columns_df["dtype"].apply(self._mysql_to_python_types) + return DBHandlerResponse(data=columns_df) + except mysql.connector.Error as e: + return DBHandlerResponse(data=None, error=str(e)) + + def _fetch_results_as_df(self, cursor): + """ + This is currently the only clean solution that we have found so far. + Reference to MySQL API: https://dev.mysql.com/doc/connector-python/en/connector-python-api-mysqlcursor-fetchall.html + In short, currently there is no very clean programming way to differentiate + CREATE, INSERT, SELECT. CREATE and INSERT do not return any result, so calling + fetchall() on those will yield a programming error. Cursor has an attribute + rowcount, but it indicates # of rows that are affected. In that case, for both + INSERT and SELECT rowcount is not 0, so we also cannot use this API to + differentiate INSERT and SELECT. 
+ """ + try: + res = cursor.fetchall() + if not res: + return pd.DataFrame({"status": ["success"]}) + res_df = pd.DataFrame(res, columns=[desc[0] for desc in cursor.description]) + return res_df + except mysql.connector.ProgrammingError as e: + if str(e) == "no results to fetch": + return pd.DataFrame({"status": ["success"]}) + raise e + + def execute_native_query(self, query_string: str) -> DBHandlerResponse: + if not self.connection: + return DBHandlerResponse(data=None, error="Not connected to the database.") + + try: + cursor = self.connection.cursor() + cursor.execute(query_string) + return DBHandlerResponse(data=self._fetch_results_as_df(cursor)) + except mysql.connector.Error as e: + return DBHandlerResponse(data=None, error=str(e)) + + def _mysql_to_python_types(self, mysql_type: str): + mapping = { + "char": str, + "varchar": str, + "text": str, + "boolean": bool, + "integer": int, + "int": int, + "float": float, + "double": float, + # Add more mappings as needed + } + + if mysql_type in mapping: + return mapping[mysql_type] + else: + raise Exception( + f"Unsupported column {mysql_type} encountered in the mysql table. Please raise a feature request!" + ) diff --git a/evadb/third_party/databases/mysql/requirements.txt b/evadb/third_party/databases/mysql/requirements.txt new file mode 100644 index 0000000000..5bb5574ad5 --- /dev/null +++ b/evadb/third_party/databases/mysql/requirements.txt @@ -0,0 +1 @@ +mysql-connector-python \ No newline at end of file diff --git a/evadb/third_party/databases/postgres/postgres_handler.py b/evadb/third_party/databases/postgres/postgres_handler.py index 4721536e2e..dda08f7da8 100644 --- a/evadb/third_party/databases/postgres/postgres_handler.py +++ b/evadb/third_party/databases/postgres/postgres_handler.py @@ -24,14 +24,26 @@ class PostgresHandler(DBHandler): def __init__(self, name: str, **kwargs): + """ + Initialize the handler. 
+ Args: + name (str): name of the DB handler instance + **kwargs: arbitrary keyword arguments for establishing the connection. + """ super().__init__(name) self.host = kwargs.get("host") self.port = kwargs.get("port") self.user = kwargs.get("user") self.password = kwargs.get("password") self.database = kwargs.get("database") + self.connection = None - def connect(self): + def connect(self) -> DBHandlerStatus: + """ + Set up the connection required by the handler. + Returns: + DBHandlerStatus + """ try: self.connection = psycopg2.connect( host=self.host, @@ -46,16 +58,29 @@ def connect(self): return DBHandlerStatus(status=False, error=str(e)) def disconnect(self): + """ + Close any existing connections. + """ if self.connection: self.connection.close() def check_connection(self) -> DBHandlerStatus: + """ + Check connection to the handler. + Returns: + DBHandlerStatus + """ if self.connection: return DBHandlerStatus(status=True) else: return DBHandlerStatus(status=False, error="Not connected to the database.") def get_tables(self) -> DBHandlerResponse: + """ + Return the list of tables in the database. + Returns: + DBHandlerResponse + """ if not self.connection: return DBHandlerResponse(data=None, error="Not connected to the database.") @@ -67,12 +92,20 @@ def get_tables(self) -> DBHandlerResponse: return DBHandlerResponse(data=None, error=str(e)) def get_columns(self, table_name: str) -> DBHandlerResponse: + """ + Returns the list of columns for the given table. + Args: + table_name (str): name of the table whose columns are to be retrieved. 
+ Returns: + DBHandlerResponse + """ if not self.connection: return DBHandlerResponse(data=None, error="Not connected to the database.") try: - query = f"SELECT column_name FROM information_schema.columns WHERE table_name='{table_name}'" + query = f"SELECT column_name as name, data_type as dtype FROM information_schema.columns WHERE table_name='{table_name}'" columns_df = pd.read_sql_query(query, self.connection) + columns_df["dtype"] = columns_df["dtype"].apply(self._pg_to_python_types) return DBHandlerResponse(data=columns_df) except psycopg2.Error as e: return DBHandlerResponse(data=None, error=str(e)) @@ -99,6 +132,13 @@ def _fetch_results_as_df(self, cursor): raise e def execute_native_query(self, query_string: str) -> DBHandlerResponse: + """ + Executes the native query on the database. + Args: + query_string (str): query in native format + Returns: + DBHandlerResponse + """ if not self.connection: return DBHandlerResponse(data=None, error="Not connected to the database.") @@ -108,3 +148,25 @@ def execute_native_query(self, query_string: str) -> DBHandlerResponse: return DBHandlerResponse(data=self._fetch_results_as_df(cursor)) except psycopg2.Error as e: return DBHandlerResponse(data=None, error=str(e)) + + def _pg_to_python_types(self, pg_type: str): + mapping = { + "integer": int, + "bigint": int, + "smallint": int, + "numeric": float, + "real": float, + "double precision": float, + "character": str, + "character varying": str, + "text": str, + "boolean": bool, + # Add more mappings as needed + } + + if pg_type in mapping: + return mapping[pg_type] + else: + raise Exception( + f"Unsupported column {pg_type} encountered in the postgres table. Please raise a feature request!" 
+ ) diff --git a/evadb/third_party/databases/sqlite/__init__.py b/evadb/third_party/databases/sqlite/__init__.py new file mode 100644 index 0000000000..747e9ea65a --- /dev/null +++ b/evadb/third_party/databases/sqlite/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""sqlite integration""" diff --git a/evadb/third_party/databases/sqlite/sqlite_handler.py b/evadb/third_party/databases/sqlite/sqlite_handler.py new file mode 100644 index 0000000000..204db36d25 --- /dev/null +++ b/evadb/third_party/databases/sqlite/sqlite_handler.py @@ -0,0 +1,139 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sqlite3 + +import pandas as pd + +from evadb.third_party.databases.types import ( + DBHandler, + DBHandlerResponse, + DBHandlerStatus, +) + + +class SQLiteHandler(DBHandler): + def __init__(self, name: str, **kwargs): + """ + Initialize the handler. 
+ Args: + name (str): name of the DB handler instance + **kwargs: arbitrary keyword arguments for establishing the connection. + """ + super().__init__(name) + self.database = kwargs.get("database") + self.connection = None + + def connect(self): + """ + Set up the connection required by the handler. + Returns: + DBHandlerStatus + """ + try: + self.connection = sqlite3.connect( + database=self.database, isolation_level=None # Autocommit mode. + ) + return DBHandlerStatus(status=True) + except sqlite3.Error as e: + return DBHandlerStatus(status=False, error=str(e)) + + def disconnect(self): + """ + Close any existing connections. + """ + if self.connection: + self.connection.close() + + def check_connection(self) -> DBHandlerStatus: + """ + Check connection to the handler. + Returns: + DBHandlerStatus + """ + if self.connection: + return DBHandlerStatus(status=True) + else: + return DBHandlerStatus(status=False, error="Not connected to the database.") + + def get_tables(self) -> DBHandlerResponse: + """ + Return the list of tables in the database. + Returns: + DBHandlerResponse + """ + if not self.connection: + return DBHandlerResponse(data=None, error="Not connected to the database.") + + try: + query = "SELECT name AS table_name FROM sqlite_master WHERE type = 'table'" + tables_df = pd.read_sql_query(query, self.connection) + return DBHandlerResponse(data=tables_df) + except sqlite3.Error as e: + return DBHandlerResponse(data=None, error=str(e)) + + def get_columns(self, table_name: str) -> DBHandlerResponse: + """ + Returns the list of columns for the given table. + Args: + table_name (str): name of the table whose columns are to be retrieved. + Returns: + DBHandlerResponse + """ + if not self.connection: + return DBHandlerResponse(data=None, error="Not connected to the database.") + """ + SQLite does not provide an in-built way to get the column names using a SELECT statement. + Hence we have to use the PRAGMA command and filter the required columns. 
+ """ + try: + query = f"PRAGMA table_info('{table_name}')" + pragma_df = pd.read_sql_query(query, self.connection) + columns_df = pragma_df[["name", "type"]].copy() + columns_df.rename(columns={"type": "dtype"}, inplace=True) + return DBHandlerResponse(data=columns_df) + except sqlite3.Error as e: + return DBHandlerResponse(data=None, error=str(e)) + + def _fetch_results_as_df(self, cursor): + try: + res = cursor.fetchall() + res_df = pd.DataFrame( + res, + columns=[desc[0] for desc in cursor.description] + if cursor.description + else [], + ) + return res_df + except sqlite3.ProgrammingError as e: + if str(e) == "no results to fetch": + return pd.DataFrame({"status": ["success"]}) + raise e + + def execute_native_query(self, query_string: str) -> DBHandlerResponse: + """ + Executes the native query on the database. + Args: + query_string (str): query in native format + Returns: + DBHandlerResponse + """ + if not self.connection: + return DBHandlerResponse(data=None, error="Not connected to the database.") + try: + cursor = self.connection.cursor() + cursor.execute(query_string) + return DBHandlerResponse(data=self._fetch_results_as_df(cursor)) + except sqlite3.Error as e: + return DBHandlerResponse(data=None, error=str(e)) diff --git a/evadb/third_party/databases/types.py b/evadb/third_party/databases/types.py index 5fc547adc2..6708cf1a11 100644 --- a/evadb/third_party/databases/types.py +++ b/evadb/third_party/databases/types.py @@ -55,6 +55,7 @@ class DBHandler: def __init__(self, name: str, **kwargs): self.name = name + self.connection = None def connect(self): """ @@ -105,7 +106,7 @@ def get_columns(self, table_name: str) -> DBHandlerResponse: table_name (str): The name of the table for which to retrieve columns. Returns: - DBHandlerResponse: An instance of DBHandlerResponse containing the columns or an error message. Data is in a pandas DataFrame. + DBHandlerResponse: An instance of DBHandlerResponse containing the columns or an error message. 
Data is in a pandas DataFrame. It should have the following two columns: name and dtype. The dtype should be a Python dtype and will default to `str`. Raises: NotImplementedError: This method should be implemented in derived classes. diff --git a/evadb/third_party/huggingface/binder.py b/evadb/third_party/huggingface/binder.py index 2181c5c134..5fadc7cc90 100644 --- a/evadb/third_party/huggingface/binder.py +++ b/evadb/third_party/huggingface/binder.py @@ -13,23 +13,23 @@ # See the License for the specific language governing permissions and # limitations under the License. from evadb.catalog.catalog_utils import get_metadata_entry_or_val -from evadb.catalog.models.udf_catalog import UdfCatalogEntry +from evadb.catalog.models.function_catalog import FunctionCatalogEntry from evadb.third_party.huggingface.create import MODEL_FOR_TASK -def assign_hf_udf(udf_obj: UdfCatalogEntry): +def assign_hf_function(function_obj: FunctionCatalogEntry): """ - Assigns the correct HF Model to the UDF. The model assigned depends on - the task type for the UDF. This is done so that we can + Assigns the correct HF Model to the Function. The model assigned depends on + the task type for the Function. This is done so that we can process the input correctly before passing it to the HF model. """ - inputs = udf_obj.args + inputs = function_obj.args # NOTE: Currently, we only support models that require a single input. assert len(inputs) == 1, "Only single input models are supported." 
- task = get_metadata_entry_or_val(udf_obj, "task", None) - assert task is not None, "task not specified in Hugging Face UDF" + task = get_metadata_entry_or_val(function_obj, "task", None) + assert task is not None, "task not specified in Hugging Face Function" model_class = MODEL_FOR_TASK[task] - return lambda: model_class(udf_obj) + return lambda: model_class(function_obj) diff --git a/evadb/third_party/huggingface/create.py b/evadb/third_party/huggingface/create.py index fd337e832f..f49488c9f6 100644 --- a/evadb/third_party/huggingface/create.py +++ b/evadb/third_party/huggingface/create.py @@ -17,8 +17,8 @@ import numpy as np from evadb.catalog.catalog_type import ColumnType, NdArrayType -from evadb.catalog.models.udf_io_catalog import UdfIOCatalogEntry -from evadb.catalog.models.udf_metadata_catalog import UdfMetadataCatalogEntry +from evadb.catalog.models.function_io_catalog import FunctionIOCatalogEntry +from evadb.catalog.models.function_metadata_catalog import FunctionMetadataCatalogEntry from evadb.third_party.huggingface.model import ( ASRHFModel, AudioHFModel, @@ -102,12 +102,12 @@ def gen_sample_input(input_type: HFInputTypes): return sample_image() elif input_type == HFInputTypes.AUDIO: return sample_audio() - assert False, "Invalid Input Type for UDF" + assert False, "Invalid Input Type for Function" def infer_output_name_and_type(**pipeline_args): """ - Infer the name and type for each output of the HuggingFace UDF + Infer the name and type for each output of the HuggingFace Function """ assert "task" in pipeline_args, "Task Not Found In Model Definition" task = pipeline_args["task"] @@ -139,23 +139,23 @@ def infer_output_name_and_type(**pipeline_args): return input_type, output_types -def io_entry_for_inputs(udf_name: str, udf_input: Union[str, List]): +def io_entry_for_inputs(function_name: str, function_input: Union[str, List]): """ - Generates the IO Catalog Entry for the inputs to HF UDFs + Generates the IO Catalog Entry for the inputs to HF 
Functions Input is one of ["text", "image", "audio", "video", "multimodal"] """ - if isinstance(udf_input, HFInputTypes): - udf_input = [udf_input] + if isinstance(function_input, HFInputTypes): + function_input = [function_input] inputs = [] - for input_type in udf_input: + for input_type in function_input: array_type = NdArrayType.ANYTYPE if input_type == HFInputTypes.TEXT: array_type = NdArrayType.STR - elif input_type == HFInputTypes.IMAGE or udf_input == HFInputTypes.AUDIO: + elif input_type == HFInputTypes.IMAGE or function_input == HFInputTypes.AUDIO: array_type = NdArrayType.FLOAT32 inputs.append( - UdfIOCatalogEntry( - name=f"{udf_name}_{input_type}", + FunctionIOCatalogEntry( + name=f"{function_name}_{input_type}", type=ColumnType.NDARRAY, is_nullable=False, array_type=array_type, @@ -177,14 +177,14 @@ def ptype_to_ndarray_type(col_type: type): return NdArrayType.ANYTYPE -def io_entry_for_outputs(udf_outputs: Dict[str, Type]): +def io_entry_for_outputs(function_outputs: Dict[str, Type]): """ Generates the IO Catalog Entry for the output """ outputs = [] - for col_name, col_type in udf_outputs.items(): + for col_name, col_type in function_outputs.items(): outputs.append( - UdfIOCatalogEntry( + FunctionIOCatalogEntry( name=col_name, type=ColumnType.NDARRAY, array_type=ptype_to_ndarray_type(col_type), @@ -194,13 +194,15 @@ def io_entry_for_outputs(udf_outputs: Dict[str, Type]): return outputs -def gen_hf_io_catalog_entries(udf_name: str, metadata: List[UdfMetadataCatalogEntry]): +def gen_hf_io_catalog_entries( + function_name: str, metadata: List[FunctionMetadataCatalogEntry] +): """ - Generates IO Catalog Entries for a HuggingFace UDF. + Generates IO Catalog Entries for a HuggingFace Function. The attributes of the huggingface model can be extracted from metadata. 
""" pipeline_args = {arg.key: arg.value for arg in metadata} - udf_input, udf_output = infer_output_name_and_type(**pipeline_args) - annotated_inputs = io_entry_for_inputs(udf_name, udf_input) - annotated_outputs = io_entry_for_outputs(udf_output) + function_input, function_output = infer_output_name_and_type(**pipeline_args) + annotated_inputs = io_entry_for_inputs(function_name, function_input) + annotated_outputs = io_entry_for_outputs(function_output) return annotated_inputs + annotated_outputs diff --git a/evadb/third_party/huggingface/model.py b/evadb/third_party/huggingface/model.py index 04d182f485..62d9bfc60e 100644 --- a/evadb/third_party/huggingface/model.py +++ b/evadb/third_party/huggingface/model.py @@ -16,7 +16,7 @@ import numpy as np -from evadb.udfs.abstract.hf_abstract_udf import AbstractHFUdf +from evadb.functions.abstract.hf_abstract_function import AbstractHFFunction from evadb.utils.generic_utils import EvaDBEnum, try_to_import_decord @@ -28,7 +28,7 @@ class HFInputTypes(EvaDBEnum): MULTIMODAL_TEXT_IMAGE # noqa: F821 -class TextHFModel(AbstractHFUdf): +class TextHFModel(AbstractHFFunction): """ Base Model for all HF Models that take in text as input """ @@ -43,7 +43,7 @@ def input_formatter(self, inputs: Any): return inputs.values.flatten().tolist() -class ImageHFModel(AbstractHFUdf): +class ImageHFModel(AbstractHFFunction): """ Base Model for all HF Models that take in images as input """ @@ -57,7 +57,7 @@ def input_formatter(self, inputs: Any): return images -class AudioHFModel(AbstractHFUdf): +class AudioHFModel(AbstractHFFunction): """ Base Model for all HF Models that take in audio as input """ diff --git a/evadb/utils/errors.py b/evadb/utils/errors.py index 4349646ce8..dd6b2e2b51 100644 --- a/evadb/utils/errors.py +++ b/evadb/utils/errors.py @@ -16,7 +16,7 @@ class CatalogError(Exception): pass -class UDFIODefinitionError(Exception): +class FunctionIODefinitionError(Exception): pass diff --git a/evadb/utils/generic_utils.py 
b/evadb/utils/generic_utils.py index 7baaa157c1..9418fb5681 100644 --- a/evadb/utils/generic_utils.py +++ b/evadb/utils/generic_utils.py @@ -63,7 +63,7 @@ def str_to_class(class_path: str): return getattr(module, class_name) -def load_udf_class_from_file(filepath, classname=None): +def load_function_class_from_file(filepath, classname=None): """ Load a class from a Python file. If the classname is not specified, the function will check if there is only one class in the file and load that. If there are multiple classes, it will raise an error. @@ -83,7 +83,7 @@ def load_udf_class_from_file(filepath, classname=None): module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) except Exception as e: - err_msg = f"Couldn't load UDF from {filepath} : {str(e)}. This might be due to a missing Python package, or because the UDF implementation file does not exist, or it is not a valid Python file." + err_msg = f"Couldn't load function from {filepath} : {str(e)}. This might be due to a missing Python package, or because the function implementation file does not exist, or it is not a valid Python file." raise RuntimeError(err_msg) # Try to load the specified class by name @@ -98,7 +98,7 @@ def load_udf_class_from_file(filepath, classname=None): ] if len(classes) != 1: raise RuntimeError( - f"{filepath} contains {len(classes)} classes, please specify the correct class to load by naming the UDF with the same name in the CREATE query." + f"{filepath} contains {len(classes)} classes, please specify the correct class to load by naming the function with the same name in the CREATE query." ) return classes[0] @@ -269,6 +269,16 @@ def try_to_import_ray(): ) +def try_to_import_forecast(): + try: + from statsforecast import StatsForecast # noqa: F401 + except ImportError: + raise ValueError( + """Could not import StatsForecast python package. 
+ Please install it with `pip install statsforecast`.""" + ) + + def is_ray_available() -> bool: try: try_to_import_ray() @@ -306,6 +316,14 @@ def is_ludwig_available() -> bool: return False +def is_forecast_available() -> bool: + try: + try_to_import_forecast() + return True + except ValueError: # noqa: E722 + return False + + ############################## ## VISION ############################## diff --git a/evadb/utils/stats.py b/evadb/utils/stats.py index 7f006a0f06..333b0d6480 100644 --- a/evadb/utils/stats.py +++ b/evadb/utils/stats.py @@ -47,7 +47,7 @@ def log_elapsed_time(self, context: str): @dataclass -class UDFStats: +class FunctionStats: def __init__(self) -> None: self.num_calls: int = 0 self.timer: Timer = Timer() diff --git a/evadb/version.py b/evadb/version.py index 41695b40d7..598dc27463 100644 --- a/evadb/version.py +++ b/evadb/version.py @@ -1,6 +1,6 @@ _MAJOR = "0" _MINOR = "3" -_REVISION = "3+dev" +_REVISION = "4+dev" VERSION_SHORT = f"{_MAJOR}.{_MINOR}" VERSION = f"{_MAJOR}.{_MINOR}.{_REVISION}" diff --git a/script/formatting/formatter.py b/script/formatting/formatter.py index d4fb2ecbac..a14b45e8e3 100755 --- a/script/formatting/formatter.py +++ b/script/formatting/formatter.py @@ -447,29 +447,29 @@ def check_file(file): #LOG.info("ASPELL") for elem in Path(EvaDB_DOCS_DIR).rglob('*.*'): - if elem.suffix == ".rst": + if elem.suffix == ".rst" or elem.suffix == ".yml": os.system(f"aspell --lang=en --personal='{ignored_words_file}' check {elem}") os.system(f"aspell --lang=en --personal='{ignored_words_file}' check 'README.md'") # CODESPELL #LOG.info("Codespell") - subprocess.check_output("codespell evadb/*.py", + subprocess.check_output("codespell 'evadb/*.py'", shell=True, universal_newlines=True) - subprocess.check_output("codespell evadb/*/*.py", + subprocess.check_output("codespell 'evadb/*/*.py'", shell=True, universal_newlines=True) - subprocess.check_output("codespell docs/source/*/*.rst", + subprocess.check_output("codespell 
'docs/source/*/*.rst'", shell=True, universal_newlines=True) - subprocess.check_output("codespell docs/source/*.rst", + subprocess.check_output("codespell 'docs/source/*.rst'", shell=True, universal_newlines=True) - subprocess.check_output("codespell *.md", + subprocess.check_output("codespell '*.md'", shell=True, universal_newlines=True) - subprocess.check_output("codespell evadb/*.md", + subprocess.check_output("codespell 'evadb/*.md'", shell=True, universal_newlines=True) diff --git a/script/formatting/spelling.txt b/script/formatting/spelling.txt index 6935718e61..c69cd0a4d9 100644 --- a/script/formatting/spelling.txt +++ b/script/formatting/spelling.txt @@ -1,4 +1,4 @@ -personal_ws-1.1 en 1431 +personal_ws-1.1 en 1563 ABCD ABCMeta ANYDIM @@ -29,6 +29,7 @@ AbstractUDF AbstractUDFTest AggregationExpression AggregationExpressionsTest +AliExpress Anirudh AnnotateTests ApplyAndMerge @@ -51,12 +52,14 @@ AudioStorageEngine AutoEnum AutoModel AutoTokenizer +Autocommit AvgPool BDD BGR BaseException BaseModel BaseService +BatchMemSizeTest BatchNorm BatchTest BinderError @@ -70,7 +73,10 @@ BoundingBox BuildJoinExecutor ByteTracker CMDClientTest +CPUs +CRM CSV +CSVLoader CSVLoaderTest CSVReader CSVTable @@ -104,13 +110,19 @@ CommandHandlerTests CommonClauses ComparisonExpression ComparisonExpressionsTest +ConfigurationFileTests ConfigurationManager ConfigurationManager's ConfigurationManagerTests ConstantValueExpression Conv CostModel +CreateDatabase +CreateDatabaseExecutor +CreateDatabaseStatement +CreateDatabaseTest CreateExecutor +CreateFromSelectPlan CreateIndexExecutor CreateIndexPlan CreateIndexStatement @@ -131,14 +143,22 @@ CropTests CustomModel DATETIME DBAPITests +DBHandler +DBHandlerResponse +DBHandlerStatus DDL DESC DETRACVideo DFS +DailyMail DataFrame DataFrameColumn DataFrameColumns DataFrameMetadata +DataFrames +DatabaseCatalog +DatabaseCatalogEntry +DatabaseCatalogService DatasetFileNotFoundError DatasetService DecoratorTests @@ -151,6 +171,9 @@ 
DeleteExecutor DeleteExecutorTest DeletePlan DeleteTableStatement +DemoDB +DemoTable +DemoUDF Deserialize DiskKVCache DistributedLogicalGetToSeqScan @@ -199,6 +222,7 @@ EvaDBQuery EvaDBRelation EvaDBServerTest EvaDBTrackerAbstractUDF +EvaQL EvaServer EverNoteLoader ExchangeExecutor @@ -216,6 +240,7 @@ ExpressionReturnType ExpressionType ExpressionUtilsTest FAISS +FFFFFF FLV FaceDet FaceDetection @@ -238,6 +263,7 @@ FeaturePayload FileExistsError FileFormatType FileNotFoundError +Finetuning FlipTests ForeignKey FrameInfo @@ -258,6 +284,7 @@ GPUs GaussianBlur GaussianBlurTests GenericHuggingfaceModel +GenericLudwigModel GroupBy GroupByExecutor GroupByPlan @@ -277,11 +304,14 @@ HashJoinBuildPlan HashJoinExecutor HashJoinPlan HashJoinProbePlan +HomeRentals HorizontalFlip HuggingFace HuggingFaceModel +HuggingFaceTest HuggingFaceTests IDMap +IGNORECASE IMAGENET INFILE IOArgument @@ -289,6 +319,7 @@ IOColumnArgument IOs ImageColumnName ImageDraw +ImageFeatureExtractor ImageHFModel ImageStorageEngine ImportError @@ -296,6 +327,7 @@ IndexCatalog IndexCatalogEntry IndexCatalogService IndexCatalogServiceTest +IndexError IndexScan InputType InsertExecutor @@ -351,7 +383,9 @@ LoadDataExecutor LoadDataPlan LoadDataStatement LoadExecutorTest +LoadExecutorTests LoadMultimediaExecutor +LoadPDFExecutorTests LoggingLevel LoggingManager LogicalApplyAndMerge @@ -365,6 +399,7 @@ LogicalCreateMaterializedView LogicalCreateMaterializedViewToPhysical LogicalCreateToPhysical LogicalCreateUDF +LogicalCreateUDFFromSelectToPhysical LogicalCreateUDFToPhysical LogicalDelete LogicalDeleteToPhysical @@ -413,6 +448,7 @@ LogicalUnionToPhysical LogicalVectorIndexScan LogicalVectorIndexScanToPhysical LordDarkula +LudwigModel MLP MNIST MNISTVid @@ -428,8 +464,11 @@ MaterializedViewTest MaxPool MemeImages MemoTest +MindsDB MiniLM MnistImageClassifier +MnistVideo +ModelTrainTests ModulePathTest MyCSV MyImage @@ -437,17 +476,24 @@ MyImages MyLargeScaleImages MyMeta MyPDFs +MySQLNativeStorageEngineTest 
MyTextCSV MyUDF MyVideo MyVideoCSV MyVideos +MydbHandler +MysqlHandler NEQ NLP NOTNULL NaN NamedTemporaryFile Namespace +NativeExecutorTest +NativePlan +NativeQueryResponse +NativeStorageEngine NdArrayType NdarrayUDF NestedLoopJoinExecutor @@ -456,6 +502,7 @@ NestedQuery NoResultFound NorFairTracker NotImplementedError +NullPool NumPy NumpyArray OCRExtraction @@ -501,6 +548,7 @@ PPScanPlan PRs PYTHONPATH PandasDataframe +PandasQATest ParallelLogicalApplyAndMergeToPhysical ParallelLogicalGetToSeqScan ParserOrderBySortType @@ -514,9 +562,13 @@ PlanGenerator PlanNodeTests PlanOprType Popen +PostgresHandler +PostgresNativeStorageEngineTest PredicateExecutor PredicatePlan +PredictHouseRent PrivateGPTTest +ProgrammingError ProjectExecutor ProjectPlan PropertyMock @@ -537,6 +589,7 @@ QL Qdrant QdrantClient QdrantVectorStore +Quadro QuerySpecification QueueReaderExecutor REUSEADDR @@ -569,15 +622,21 @@ SNL SQLAlchemy SQLAlchemyColumns SQLAlchemyError +SQLAlchemyPlan SQLAlchemyTests SQLConfig SQLStorageEngine SQLStorageEngineTest +SQLiteHandler +SQLiteNativeStorageEngineTest SaliencyFeatureExtractor SaliencyTests SampleExecutor SampleExecutorTest SamplePlan +SampleTable +SampleVideoTable +Scalability ScanPlan SchemaUtils Segmenter @@ -628,7 +687,9 @@ Summarizer SyntaxError SystemError TAIPAI +TCP TMP +TOKENIZERS TableCatalog TableCatalogEntry TableCatalogEntryEntryEntryEntry @@ -653,9 +714,15 @@ TestOCR TestOptimizerContext TestOptimizerTask TestSuite +TestTable +TestTextHFModel TestUDF +TextFilterKeyword +TextFilteringTests TextHFModel TextLoader +TextPickleType +TextProcessing TextTestRunner TimeoutError TimerTests @@ -668,6 +735,7 @@ TopDown TopDownRewrite ToxicityClassifier TupleValueExpression +TypeDecorator TypeError TypeVar UADETRAC @@ -717,8 +785,11 @@ UnstructuredMarkdownLoader UnstructuredPowerPointLoader UnstructuredWordDocumentLoader Upserts +UseExecutor +UseStatement VARCHAR VGG +VSCode ValueError ValueList VectorIndexQuery @@ -741,6 +812,7 @@ WIP WMV 
WeakValueDictionary XdistTests +Xeon XformExtractObjectToLinearFlow XformLateralJoinToLinearFlow YOLO @@ -749,9 +821,11 @@ YoloDecorators YoloObjectDetection YoloTest YoloV +YoutubeChannelQATest YoutubeQATest Za aa +aaaa aaequality abc abdfdsfds @@ -765,6 +839,7 @@ addTypeEqualityFunc aenum agg aggr +ai aidb amongst analytics @@ -806,21 +881,27 @@ audioless autofunction autogenerated automethod +automl autosummary autouse avi backend +backends backoff bart batchable bavail +bb +bbbb bbox bboxes bdist benchmarkImageDataset bfs bgr +bigint binded +binderContext bool boolean bools @@ -846,6 +927,8 @@ ci classmethod classname cli +cln +clna clova cls cmd @@ -893,6 +976,7 @@ dataframe dataframes dataset datasets +datasource datatypes datefmt datetime @@ -906,6 +990,8 @@ defs del dep deque +desc +deserialization deserialize det detections @@ -920,6 +1006,7 @@ dicts dir dirname diskcache +distilbart dists distutils dml @@ -929,8 +1016,10 @@ docx doesnot donut dottedid +dq dropbox dropdown +dropna ds dst dtype @@ -965,11 +1054,13 @@ exch expr expresssion exprs +extractOne f'LOAD facebook facenet failureException faiss +faq fastRCNN fasterRCNN fasterrcnn @@ -977,6 +1068,8 @@ fastrcnn fc featCol featureextractor +feedstock +fetchall ffill ffmpeg filepath @@ -1005,6 +1098,7 @@ gaurav gaussianBlur gb gc +gdp georgia geq getLogger @@ -1026,6 +1120,8 @@ gpt gpu gpus grayscale +grcpio +greenlet groupable groupby grp @@ -1070,6 +1166,7 @@ insrt integratedTerminal integrations intp +invaid io ipynb iq @@ -1088,9 +1185,11 @@ iter iterrows itertools jarulraj +jb jpeg jpg json +ju kaushikravichandran klass kornia @@ -1115,6 +1214,7 @@ loopback lproject lsof lstrip +ludwig macos makedirs maxdepth @@ -1125,10 +1225,13 @@ memeimages metaclass metafile metainfo +mindsdb miniconda +mins mkdir mnist mnistcnn +mnistimageclassifier mnistvid mnt moto @@ -1142,6 +1245,8 @@ multiline multimodal mvit mvitactionrecognition +mxztt +mydb myimages mysql myvideo @@ -1191,6 +1296,7 @@ param params 
pardir parserVisitor +patcher pathlib pdf pdfs @@ -1202,10 +1308,12 @@ pkill pkl plangenerator png +poolclass popleft pos posix postgres +postgresql pple pprint ppt @@ -1223,12 +1331,14 @@ proc proj prpty psutil +psycopg pth ptn ptype pushdown px py +pymupdfs pypi pypirc pypitest @@ -1248,6 +1358,7 @@ realimport recurse reddit relu +repo repr req resnet @@ -1261,6 +1372,7 @@ rmtree roadmap roberta rollbacked +rowcount rsplit rst rstrip @@ -1268,6 +1380,7 @@ rtype ruletype runnable runtime +russia saliency samsum sdist @@ -1280,6 +1393,7 @@ setLevel setUp setUpClass setattr +setuptools sharded shutil sig @@ -1287,16 +1401,21 @@ sigs simiarity singledispatch singledispatchmethod +sk +smallint smi softmax spacy +speechrecognizer splitter +sqft sql sqlaclchemy sqlalchemy sqlengine sqlite src +sshleifer statementtoplanconverter staticmethod statvfs @@ -1351,8 +1470,10 @@ testsimilarityimagedataset testsimilaritytable th thefuzz +timm tmp toGrayscale +toc toctree todo tokenizer @@ -1361,6 +1482,7 @@ torchvision tup tv tve +tvp txt ty ua @@ -1376,6 +1498,7 @@ udfs uid uint uknown +ukraine ultralytics un unicode @@ -1387,19 +1510,26 @@ upsert upserting uri url +urllib +urlparse urls +usecases usecols utf util utils uuid +varchar venv verticalFlip vgg vid +videotable virtualenv +virtualenvs vl vr +vscode vstack wal warmup @@ -1419,8 +1549,10 @@ xformed xlm xmax xmin +xxxxxx xyxy xzdandy +xzr yaml yc ymax diff --git a/script/test/link_check_config.json b/script/test/link_check_config.json new file mode 100644 index 0000000000..cd8360d411 --- /dev/null +++ b/script/test/link_check_config.json @@ -0,0 +1,7 @@ +{ + "ignorePatterns": [ + { + "pattern": "^https://twitter.com" + } + ] +} diff --git a/script/test/test.sh b/script/test/test.sh index 591ffa82de..1e155923f2 100644 --- a/script/test/test.sh +++ b/script/test/test.sh @@ -37,14 +37,40 @@ check_linter() { print_error_code $code "LINTER" } +check_doc_build() { + pushd docs + make html + code=$? 
+ popd + print_error_code $code "DOC BUILD" +} + +check_doc_link() { + pushd docs + make linkcheck + code=$? + popd + print_error_code $code "DOC LINK CHECK" +} + +check_readme_link() { + if command -v npm > /dev/null && command -v npx >/dev/null && npm list --depth=0 | grep markdown-link-check; then + npx markdown-link-check -c ./script/test/link_check_config.json ./README.md + code=$? + print_error_code $code "README LINK CHECK" + else + echo "README LINK CHECK: --||-- SKIPPED (missing dependency: npm install markdown-link-check)" + fi +} + unit_test() { - PYTHONPATH="." pytest test/unit_tests/ --durations=20 --cov-report term-missing:skip-covered --cov-config=.coveragerc --cov-context=test --cov=evadb/ --capture=sys --tb=short -v -rsf --log-level=WARNING -m "not benchmark" + PYTHONPATH=./ pytest test/unit_tests/ --durations=20 --cov-report term-missing:skip-covered --cov-config=.coveragerc --cov-context=test --cov=evadb/ --capture=sys --tb=short -v -rsf --log-level=WARNING -m "not benchmark" code=$? print_error_code $code "UNIT TEST" } short_integration_test() { - PYTHONPATH=./ python -m pytest test/integration_tests/short/ -p no:cov -m "not benchmark" + PYTHONPATH=./ pytest test/integration_tests/short/ --durations=20 --cov-report term-missing:skip-covered --cov-config=.coveragerc --cov-context=test --cov=evadb/ --capture=sys --tb=short -v -rsf --log-level=WARNING -m "not benchmark" code=$? print_error_code $code "SHORT INTEGRATION TEST" } @@ -56,7 +82,7 @@ long_integration_test() { } notebook_test() { - PYTHONPATH=./ python -m pytest --durations=5 --nbmake --overwrite "./tutorials" --capture=sys --tb=short -v --log-level=WARNING --nbmake-timeout=3000 --ignore="tutorials/08-chatgpt.ipynb" + PYTHONPATH=./ python -m pytest --durations=5 --nbmake --overwrite "./tutorials" --capture=sys --tb=short -v --log-level=WARNING --nbmake-timeout=3000 --ignore="tutorials/08-chatgpt.ipynb" --ignore="tutorials/14-food-review-tone-analysis-and-response.ipynb" code=$? 
print_error_code $code "NOTEBOOK TEST" } @@ -117,6 +143,18 @@ then check_linter fi +################################################## +## DOC BUILD TESTS +################################################## + +if [[ ( "$OSTYPE" != "msys" ) && ( "$MODE" = "DOC" || "$MODE" = "ALL" ) ]]; +then + # Run black, isort, linter + check_doc_build + check_doc_link + check_readme_link +fi + ################################################## ## UNIT TESTS ################################################## @@ -176,8 +214,7 @@ fi ## based on Python version ################################################## -if [[ ( "$PYTHON_VERSION" = "3.10" ) && - ( "$MODE" = "FULL" ) ]]; +if [[ ( "$PYTHON_VERSION" = "3.10" ) ]] then echo "UPLOADING COVERAGE REPORT" coveralls diff --git a/setup.py b/setup.py index 696c3bf294..7db804fe24 100644 --- a/setup.py +++ b/setup.py @@ -27,6 +27,7 @@ # if sys.version_info < (3, 8): # sys.exit("Python 3.8 or later is required.") + def read(path, encoding="utf-8"): path = os.path.join(os.path.dirname(__file__), path) with io.open(path, encoding=encoding) as fp: @@ -52,9 +53,9 @@ def read(path, encoding="utf-8"): "aenum>=2.2.0", "diskcache>=5.4.0", "retry>=0.9.2", - "pydantic<2", # ray-project/ray#37019. + "pydantic<2", # ray-project/ray#37019. 
"psutil", - "thefuzz" + "thefuzz", ] vision_libs = [ @@ -74,7 +75,7 @@ def read(path, encoding="utf-8"): "transformers", # HUGGINGFACE "langchain", # DATA LOADERS "faiss-cpu", # DEFAULT VECTOR INDEX - "pymupdf<1.23.0", # pymupdf/PyMuPDF#2617 and pymupdf/PyMuPDF#2614 + "pymupdf<1.23.0", # pymupdf/PyMuPDF#2617 and pymupdf/PyMuPDF#2614 "pdfminer.six", "sentence-transformers", "protobuf", @@ -84,7 +85,7 @@ def read(path, encoding="utf-8"): "sentencepiece", # TRANSFORMERS ] -udf_libs = [ +function_libs = [ "facenet-pytorch>=2.5.2", # FACE DETECTION "pytube", # YOUTUBE QA APP "youtube-transcript-api", # YOUTUBE QA APP @@ -105,16 +106,16 @@ def read(path, encoding="utf-8"): "nest-asyncio>=1.5.6", ] -qdrant_libs = [ - "qdrant_client" # cannot install on 3.11 due to grcpio -] +qdrant_libs = ["qdrant_client"] # cannot install on 3.11 due to grcpio postgres_libs = [ "psycopg2", ] -ludwig_libs = [ - "ludwig[hyperopt,distributed]" # MODEL TRAIN AND FINE TUNING +ludwig_libs = ["ludwig[hyperopt,distributed]"] # MODEL TRAIN AND FINE TUNING + +forecasting_libs = [ + "statsforecast" # MODEL TRAIN AND FINE TUNING ] ### NEEDED FOR DEVELOPER TESTING ONLY @@ -148,13 +149,14 @@ def read(path, encoding="utf-8"): "ray": ray_libs, "vision": vision_libs, "document": document_libs, - "udf": udf_libs, + "function": function_libs, "notebook": notebook_libs, "qdrant": qdrant_libs, "postgres": postgres_libs, "ludwig": ludwig_libs, - # everything except ray, qdrant and postgres - "dev": dev_libs + vision_libs + document_libs + udf_libs + notebook_libs, + "forecasting": forecasting_libs, + # everything except ray, qdrant, ludwig and postgres. The first three fail on pyhton 3.11. 
+ "dev": dev_libs + vision_libs + document_libs + function_libs + notebook_libs + forecasting_libs, } setup( @@ -188,5 +190,11 @@ def read(path, encoding="utf-8"): install_requires=INSTALL_REQUIRES, extras_require=EXTRA_REQUIRES, include_package_data=True, - package_data={"evadb": ["evadb.yml", "parser/evadb.lark"]}, + package_data={ + "evadb": [ + "evadb.yml", + "parser/evadb.lark", + "third_party/databases/**/requirements.txt", + ] + }, ) diff --git a/test/benchmark_tests/conftest.py b/test/benchmark_tests/conftest.py index ec55e29774..8ad336c16b 100644 --- a/test/benchmark_tests/conftest.py +++ b/test/benchmark_tests/conftest.py @@ -16,8 +16,8 @@ import pytest +from evadb.functions.function_bootstrap_queries import init_builtin_functions from evadb.server.command_handler import execute_query_fetch_all -from evadb.udfs.udf_bootstrap_queries import init_builtin_udfs @pytest.fixture(autouse=False) @@ -31,5 +31,5 @@ def setup_pytorch_tests(): execute_query_fetch_all( evadb, "LOAD VIDEO 'data/sample_videos/touchdown.mp4' INTO VIDEOS" ) - init_builtin_udfs(evadb, mode="release") + init_builtin_functions(evadb, mode="release") return evadb diff --git a/test/benchmark_tests/test_benchmark_pytorch.py b/test/benchmark_tests/test_benchmark_pytorch.py index de6a01439c..b9300cfd66 100644 --- a/test/benchmark_tests/test_benchmark_pytorch.py +++ b/test/benchmark_tests/test_benchmark_pytorch.py @@ -109,7 +109,7 @@ def test_automatic_speech_recognition(benchmark, setup_pytorch_tests): udf_name = "SpeechRecognizer" create_udf = ( f"CREATE UDF {udf_name} TYPE HuggingFace " - "'task' 'automatic-speech-recognition' 'model' 'openai/whisper-base';" + "TASK 'automatic-speech-recognition' MODEL 'openai/whisper-base';" ) execute_query_fetch_all(setup_pytorch_tests, create_udf) @@ -135,14 +135,14 @@ def test_summarization_from_video(benchmark, setup_pytorch_tests): asr_udf = "SpeechRecognizer" create_udf = ( f"CREATE UDF {asr_udf} TYPE HuggingFace " - "'task' 
'automatic-speech-recognition' 'model' 'openai/whisper-base';" + "TASK 'automatic-speech-recognition' MODEL 'openai/whisper-base';" ) execute_query_fetch_all(setup_pytorch_tests, create_udf) summary_udf = "Summarizer" create_udf = ( f"CREATE UDF {summary_udf} TYPE HuggingFace " - "'task' 'summarization' 'model' 'philschmid/bart-large-cnn-samsum' 'min_length' 10 'max_length' 100;" + "TASK 'summarization' MODEL 'philschmid/bart-large-cnn-samsum' MIN_LENGTH 10 MAX_LENGTH 100;" ) execute_query_fetch_all(setup_pytorch_tests, create_udf) diff --git a/test/integration_tests/long/udfs/__init__.py b/test/integration_tests/long/functions/__init__.py similarity index 100% rename from test/integration_tests/long/udfs/__init__.py rename to test/integration_tests/long/functions/__init__.py diff --git a/test/integration_tests/long/functions/ndarray/__init__.py b/test/integration_tests/long/functions/ndarray/__init__.py new file mode 100644 index 0000000000..7e0df17385 --- /dev/null +++ b/test/integration_tests/long/functions/ndarray/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""user defined test functions operating on ndarrays functions""" diff --git a/test/integration_tests/long/udfs/ndarray/test_annotate.py b/test/integration_tests/long/functions/ndarray/test_annotate.py similarity index 97% rename from test/integration_tests/long/udfs/ndarray/test_annotate.py rename to test/integration_tests/long/functions/ndarray/test_annotate.py index 6e548fba09..b70b3f7196 100644 --- a/test/integration_tests/long/udfs/ndarray/test_annotate.py +++ b/test/integration_tests/long/functions/ndarray/test_annotate.py @@ -19,7 +19,7 @@ from numpy import asarray from evadb.configuration.constants import EvaDB_ROOT_DIR -from evadb.udfs.ndarray.annotate import Annotate +from evadb.functions.ndarray.annotate import Annotate from evadb.utils.generic_utils import try_to_import_pillow diff --git a/test/integration_tests/long/udfs/ndarray/test_array_count.py b/test/integration_tests/long/functions/ndarray/test_array_count.py similarity index 93% rename from test/integration_tests/long/udfs/ndarray/test_array_count.py rename to test/integration_tests/long/functions/ndarray/test_array_count.py index 9c43d1500b..70b78a12ef 100644 --- a/test/integration_tests/long/udfs/ndarray/test_array_count.py +++ b/test/integration_tests/long/functions/ndarray/test_array_count.py @@ -14,7 +14,7 @@ # limitations under the License. 
import unittest -from evadb.udfs.ndarray.array_count import ArrayCount +from evadb.functions.ndarray.array_count import ArrayCount class CropTests(unittest.TestCase): diff --git a/test/integration_tests/long/udfs/ndarray/test_crop.py b/test/integration_tests/long/functions/ndarray/test_crop.py similarity index 98% rename from test/integration_tests/long/udfs/ndarray/test_crop.py rename to test/integration_tests/long/functions/ndarray/test_crop.py index d99cb50c36..f8c52397b1 100644 --- a/test/integration_tests/long/udfs/ndarray/test_crop.py +++ b/test/integration_tests/long/functions/ndarray/test_crop.py @@ -17,7 +17,7 @@ import numpy as np import pandas as pd -from evadb.udfs.ndarray.crop import Crop +from evadb.functions.ndarray.crop import Crop class CropTests(unittest.TestCase): diff --git a/test/integration_tests/long/udfs/ndarray/test_flips.py b/test/integration_tests/long/functions/ndarray/test_flips.py similarity index 94% rename from test/integration_tests/long/udfs/ndarray/test_flips.py rename to test/integration_tests/long/functions/ndarray/test_flips.py index 90b239fc5d..c117d610f3 100644 --- a/test/integration_tests/long/udfs/ndarray/test_flips.py +++ b/test/integration_tests/long/functions/ndarray/test_flips.py @@ -19,8 +19,8 @@ from numpy import asarray from evadb.configuration.constants import EvaDB_ROOT_DIR -from evadb.udfs.ndarray.horizontal_flip import HorizontalFlip -from evadb.udfs.ndarray.vertical_flip import VerticalFlip +from evadb.functions.ndarray.horizontal_flip import HorizontalFlip +from evadb.functions.ndarray.vertical_flip import VerticalFlip from evadb.utils.generic_utils import try_to_import_pillow diff --git a/test/integration_tests/long/udfs/ndarray/test_gaussian_blur.py b/test/integration_tests/long/functions/ndarray/test_gaussian_blur.py similarity index 82% rename from test/integration_tests/long/udfs/ndarray/test_gaussian_blur.py rename to test/integration_tests/long/functions/ndarray/test_gaussian_blur.py index 
cfcb47da4c..27f5000f1d 100644 --- a/test/integration_tests/long/udfs/ndarray/test_gaussian_blur.py +++ b/test/integration_tests/long/functions/ndarray/test_gaussian_blur.py @@ -20,14 +20,14 @@ import pandas as pd from evadb.configuration.constants import EvaDB_ROOT_DIR -from evadb.udfs.ndarray.gaussian_blur import GaussianBlur +from evadb.functions.ndarray.gaussian_blur import GaussianBlur from evadb.utils.generic_utils import try_to_import_cv2 class GaussianBlurTests(unittest.TestCase): def setUp(self): self.gb_instance = GaussianBlur() - self.tmp_file = f"{EvaDB_ROOT_DIR}/test/unit_tests/udfs/data/tmp.jpeg" + self.tmp_file = f"{EvaDB_ROOT_DIR}/test/unit_tests/functions/data/tmp.jpeg" def test_gb_name_exists(self): assert hasattr(self.gb_instance, "name") @@ -36,7 +36,7 @@ def test_should_blur_image(self): try_to_import_cv2() import cv2 - arr = cv2.imread(f"{EvaDB_ROOT_DIR}/test/unit_tests/udfs/data/dog.jpeg") + arr = cv2.imread(f"{EvaDB_ROOT_DIR}/test/unit_tests/functions/data/dog.jpeg") df = pd.DataFrame([[arr]]) modified_arr = self.gb_instance(df)["blurred_frame_array"] cv2.imwrite( @@ -46,7 +46,7 @@ def test_should_blur_image(self): actual_array = cv2.imread(self.tmp_file) expected_array = cv2.imread( - f"{EvaDB_ROOT_DIR}/test/unit_tests/udfs/data/blurred_dog.jpeg" + f"{EvaDB_ROOT_DIR}/test/unit_tests/functions/data/blurred_dog.jpeg" ) self.assertEqual(np.sum(actual_array - expected_array), 0) file_remove(Path(self.tmp_file)) diff --git a/test/integration_tests/long/udfs/ndarray/test_open.py b/test/integration_tests/long/functions/ndarray/test_open.py similarity index 98% rename from test/integration_tests/long/udfs/ndarray/test_open.py rename to test/integration_tests/long/functions/ndarray/test_open.py index defc9c2b55..fd9c4b0e62 100644 --- a/test/integration_tests/long/udfs/ndarray/test_open.py +++ b/test/integration_tests/long/functions/ndarray/test_open.py @@ -20,7 +20,7 @@ import pytest from mock import patch -from evadb.udfs.ndarray.open import Open 
+from evadb.functions.ndarray.open import Open from evadb.utils.generic_utils import try_to_import_cv2 diff --git a/test/integration_tests/long/udfs/ndarray/test_to_grayscale.py b/test/integration_tests/long/functions/ndarray/test_to_grayscale.py similarity index 75% rename from test/integration_tests/long/udfs/ndarray/test_to_grayscale.py rename to test/integration_tests/long/functions/ndarray/test_to_grayscale.py index e8bccb11a1..eaa13b3dea 100644 --- a/test/integration_tests/long/udfs/ndarray/test_to_grayscale.py +++ b/test/integration_tests/long/functions/ndarray/test_to_grayscale.py @@ -20,7 +20,7 @@ import pandas as pd from evadb.configuration.constants import EvaDB_ROOT_DIR -from evadb.udfs.ndarray.to_grayscale import ToGrayscale +from evadb.functions.ndarray.to_grayscale import ToGrayscale from evadb.utils.generic_utils import try_to_import_cv2 @@ -35,17 +35,17 @@ def test_should_convert_to_grayscale(self): try_to_import_cv2() import cv2 - arr = cv2.imread(f"{EvaDB_ROOT_DIR}/test/unit_tests/udfs/data/dog.jpeg") + arr = cv2.imread(f"{EvaDB_ROOT_DIR}/test/unit_tests/functions/data/dog.jpeg") df = pd.DataFrame([[arr]]) modified_arr = self.to_grayscale_instance(df)["grayscale_frame_array"] cv2.imwrite( - f"{EvaDB_ROOT_DIR}/test/unit_tests/udfs/data/tmp.jpeg", modified_arr[0] + f"{EvaDB_ROOT_DIR}/test/unit_tests/functions/data/tmp.jpeg", modified_arr[0] ) actual_array = cv2.imread( - f"{EvaDB_ROOT_DIR}/test/unit_tests/udfs/data/tmp.jpeg" + f"{EvaDB_ROOT_DIR}/test/unit_tests/functions/data/tmp.jpeg" ) expected_arr = cv2.imread( - f"{EvaDB_ROOT_DIR}/test/unit_tests/udfs/data/grayscale_dog.jpeg" + f"{EvaDB_ROOT_DIR}/test/unit_tests/functions/data/grayscale_dog.jpeg" ) self.assertEqual(np.sum(actual_array - expected_arr), 0) - file_remove(Path(f"{EvaDB_ROOT_DIR}/test/unit_tests/udfs/data/tmp.jpeg")) + file_remove(Path(f"{EvaDB_ROOT_DIR}/test/unit_tests/functions/data/tmp.jpeg")) diff --git a/test/integration_tests/long/udfs/test_chatgpt.py 
b/test/integration_tests/long/functions/test_chatgpt.py similarity index 84% rename from test/integration_tests/long/udfs/test_chatgpt.py rename to test/integration_tests/long/functions/test_chatgpt.py index aca6e0d5fc..b72612d050 100644 --- a/test/integration_tests/long/udfs/test_chatgpt.py +++ b/test/integration_tests/long/functions/test_chatgpt.py @@ -58,15 +58,15 @@ def tearDown(self) -> None: execute_query_fetch_all(self.evadb, "DROP TABLE IF EXISTS MyTextCSV;") @chatgpt_skip_marker - def test_openai_chat_completion_udf(self): - udf_name = "OpenAIChatCompletion" - execute_query_fetch_all(self.evadb, f"DROP UDF IF EXISTS {udf_name};") + def test_openai_chat_completion_function(self): + function_name = "OpenAIChatCompletion" + execute_query_fetch_all(self.evadb, f"DROP FUNCTION IF EXISTS {function_name};") - create_udf_query = f"""CREATE UDF IF NOT EXISTS{udf_name} - IMPL 'evadb/udfs/chatgpt.py'; + create_function_query = f"""CREATE FUNCTION IF NOT EXISTS{function_name} + IMPL 'evadb/functions/chatgpt.py'; """ - execute_query_fetch_all(self.evadb, create_udf_query) + execute_query_fetch_all(self.evadb, create_function_query) - gpt_query = f"SELECT {udf_name}('summarize', content) FROM MyTextCSV;" + gpt_query = f"SELECT {function_name}('summarize', content) FROM MyTextCSV;" output_batch = execute_query_fetch_all(self.evadb, gpt_query) self.assertEqual(output_batch.columns, ["openaichatcompletion.response"]) diff --git a/test/integration_tests/long/udfs/test_emotion_detector.py b/test/integration_tests/long/functions/test_emotion_detector.py similarity index 96% rename from test/integration_tests/long/udfs/test_emotion_detector.py rename to test/integration_tests/long/functions/test_emotion_detector.py index 433b3b9796..ea6824de1c 100644 --- a/test/integration_tests/long/udfs/test_emotion_detector.py +++ b/test/integration_tests/long/functions/test_emotion_detector.py @@ -38,7 +38,7 @@ def _load_image(self, path): @unittest.skip("disable test due to model 
downloading time") def test_should_return_correct_emotion(self): - from evadb.udfs.emotion_detector import EmotionDetector + from evadb.functions.emotion_detector import EmotionDetector happy_img = self.base_path / "happy.jpg" sad_img = self.base_path / "sad.jpg" diff --git a/test/integration_tests/long/udfs/test_facenet_udf.py b/test/integration_tests/long/functions/test_facenet_udf.py similarity index 95% rename from test/integration_tests/long/udfs/test_facenet_udf.py rename to test/integration_tests/long/functions/test_facenet_udf.py index 76a36a401a..c54c2418e2 100644 --- a/test/integration_tests/long/udfs/test_facenet_udf.py +++ b/test/integration_tests/long/functions/test_facenet_udf.py @@ -40,7 +40,7 @@ def _load_image(self, path): @windows_skip_marker def test_should_return_batches_equivalent_to_number_of_frames(self): - from evadb.udfs.face_detector import FaceDetector + from evadb.functions.face_detector import FaceDetector single_face_img = Path("data/facenet/one.jpg") multi_face_img = Path("data/facenet/multiface.jpg") @@ -65,7 +65,7 @@ def test_should_return_batches_equivalent_to_number_of_frames(self): @unittest.skip("Needs GPU") def test_should_run_on_gpu(self): - from evadb.udfs.face_detector import FaceDetector + from evadb.functions.face_detector import FaceDetector single_face_img = Path("data/facenet/one.jpg") frame_single_face = { diff --git a/test/integration_tests/long/udfs/test_fastrcnn_object_detector.py b/test/integration_tests/long/functions/test_fastrcnn_object_detector.py similarity index 95% rename from test/integration_tests/long/udfs/test_fastrcnn_object_detector.py rename to test/integration_tests/long/functions/test_fastrcnn_object_detector.py index a97ee3857d..2e587bfaca 100644 --- a/test/integration_tests/long/udfs/test_fastrcnn_object_detector.py +++ b/test/integration_tests/long/functions/test_fastrcnn_object_detector.py @@ -37,7 +37,7 @@ def _load_image(self, path): @unittest.skip("disable test due to model downloading time") 
def test_should_return_batches_equivalent_to_number_of_frames(self): - from evadb.udfs.fastrcnn_object_detector import FastRCNNObjectDetector + from evadb.functions.fastrcnn_object_detector import FastRCNNObjectDetector frame_dog = { "id": 1, diff --git a/test/integration_tests/long/udfs/test_hugging_face.py b/test/integration_tests/long/functions/test_hugging_face.py similarity index 87% rename from test/integration_tests/long/udfs/test_hugging_face.py rename to test/integration_tests/long/functions/test_hugging_face.py index a2e537a8b6..550d178aa0 100644 --- a/test/integration_tests/long/udfs/test_hugging_face.py +++ b/test/integration_tests/long/functions/test_hugging_face.py @@ -25,7 +25,7 @@ class TestTextHFModel(TextHFModel): @property def default_pipeline_args(self) -> dict: # We need to improve the hugging face interface, passing - # UdfCatalogEntry into UDF is not ideal. + # FunctionCatalogEntry into Function is not ideal. return { "task": "summarization", "model": "sshleifer/distilbart-cnn-12-6", @@ -36,9 +36,9 @@ def default_pipeline_args(self) -> dict: class HuggingFaceTest(unittest.TestCase): def test_hugging_face_with_large_input(self): - udf_obj = MagicMock() - udf_obj.metadata = [] - text_summarization_model = TestTextHFModel(udf_obj) + function_obj = MagicMock() + function_obj.metadata = [] + text_summarization_model = TestTextHFModel(function_obj) large_text = pd.DataFrame([{"text": "hello" * 4096}]) try: diff --git a/test/integration_tests/long/udfs/test_yolo_object_detector.py b/test/integration_tests/long/functions/test_yolo_object_detector.py similarity index 93% rename from test/integration_tests/long/udfs/test_yolo_object_detector.py rename to test/integration_tests/long/functions/test_yolo_object_detector.py index 12ddd974fb..0c68eb167d 100644 --- a/test/integration_tests/long/udfs/test_yolo_object_detector.py +++ b/test/integration_tests/long/functions/test_yolo_object_detector.py @@ -54,7 +54,7 @@ def _load_image(self, path): def 
test_should_raise_import_error_with_missing_torch(self): with self.assertRaises(ImportError): with mock.patch.dict(sys.modules, {"torch": None}): - from evadb.udfs.decorators.yolo_object_detection_decorators import ( # noqa: F401 + from evadb.functions.decorators.yolo_object_detection_decorators import ( # noqa: F401 Yolo, ) @@ -62,7 +62,7 @@ def test_should_raise_import_error_with_missing_torch(self): @unittest.skip("disable test due to model downloading time") def test_should_return_batches_equivalent_to_number_of_frames(self): - from evadb.udfs.decorators.yolo_object_detection_decorators import Yolo + from evadb.functions.decorators.yolo_object_detection_decorators import Yolo frame_dog = { "id": 1, diff --git a/test/integration_tests/long/interfaces/relational/test_relational_api.py b/test/integration_tests/long/interfaces/relational/test_relational_api.py index c1bed35cfd..773607960b 100644 --- a/test/integration_tests/long/interfaces/relational/test_relational_api.py +++ b/test/integration_tests/long/interfaces/relational/test_relational_api.py @@ -17,7 +17,7 @@ from test.util import ( DummyObjectDetector, create_sample_video, - load_udfs_for_testing, + load_functions_for_testing, shutdown_ray, suffix_pytest_xdist_worker_id_to_dir, ) @@ -47,7 +47,7 @@ def setUpClass(cls): def setUp(self): self.evadb.catalog().reset() self.mnist_path = f"{EvaDB_ROOT_DIR}/data/mnist/mnist.mp4" - load_udfs_for_testing( + load_functions_for_testing( self.evadb, ) self.images = f"{EvaDB_ROOT_DIR}/data/detoxify/*.jpg" @@ -174,10 +174,10 @@ def test_create_index(self): ) rel.execute() - # todo support register udf + # todo support register function cursor.query( - f"""CREATE UDF IF NOT EXISTS SiftFeatureExtractor - IMPL '{EvaDB_ROOT_DIR}/evadb/udfs/sift_feature_extractor.py'""" + f"""CREATE FUNCTION IF NOT EXISTS SiftFeatureExtractor + IMPL '{EvaDB_ROOT_DIR}/evadb/functions/sift_feature_extractor.py'""" ).df() # create a vector index using QDRANT @@ -206,7 +206,7 @@ def 
test_create_index(self): ) assert_frame_equal(rel.df(), cursor.query(similarity_sql).df()) - def test_create_udf_with_relational_api(self): + def test_create_function_with_relational_api(self): video_file_path = create_sample_video(10) cursor = self.conn.cursor() @@ -218,34 +218,34 @@ def test_create_udf_with_relational_api(self): ) rel.execute() - create_dummy_object_detector_udf = cursor.create_function( + create_dummy_object_detector_function = cursor.create_function( "DummyObjectDetector", if_not_exists=True, impl_path="test/util.py" ) - create_dummy_object_detector_udf.execute() + create_dummy_object_detector_function.execute() args = {"task": "automatic-speech-recognition", "model": "openai/whisper-base"} - create_speech_recognizer_udf_if_not_exists = cursor.create_function( + create_speech_recognizer_function_if_not_exists = cursor.create_function( "SpeechRecognizer", if_not_exists=True, type="HuggingFace", **args ) - query = create_speech_recognizer_udf_if_not_exists.sql_query() + query = create_speech_recognizer_function_if_not_exists.sql_query() self.assertEqual( query, - """CREATE UDF IF NOT EXISTS SpeechRecognizer TYPE HuggingFace 'task' 'automatic-speech-recognition' 'model' 'openai/whisper-base'""", + """CREATE FUNCTION IF NOT EXISTS SpeechRecognizer TYPE HuggingFace TASK 'automatic-speech-recognition' MODEL 'openai/whisper-base'""", ) - create_speech_recognizer_udf_if_not_exists.execute() + create_speech_recognizer_function_if_not_exists.execute() - # check if next create call of same UDF raises error - create_speech_recognizer_udf = cursor.create_function( + # check if next create call of same Function raises error + create_speech_recognizer_function = cursor.create_function( "SpeechRecognizer", if_not_exists=False, type="HuggingFace", **args ) - query = create_speech_recognizer_udf.sql_query() + query = create_speech_recognizer_function.sql_query() self.assertEqual( query, - "CREATE UDF SpeechRecognizer TYPE HuggingFace 'task' 
'automatic-speech-recognition' 'model' 'openai/whisper-base'", + "CREATE FUNCTION SpeechRecognizer TYPE HuggingFace TASK 'automatic-speech-recognition' MODEL 'openai/whisper-base'", ) with self.assertRaises(ExecutorError): - create_speech_recognizer_udf.execute() + create_speech_recognizer_function.execute() select_query_sql = ( "SELECT id, DummyObjectDetector(data) FROM dummy_video ORDER BY id;" @@ -274,17 +274,17 @@ def test_drop_with_relational_api(self): ) rel.execute() - # Create dummy udf - create_dummy_object_detector_udf = cursor.create_function( + # Create dummy function + create_dummy_object_detector_function = cursor.create_function( "DummyObjectDetector", if_not_exists=True, impl_path="test/util.py" ) - create_dummy_object_detector_udf.execute() + create_dummy_object_detector_function.execute() - # drop dummy udf - drop_dummy_object_detector_udf = cursor.drop_function( + # drop dummy function + drop_dummy_object_detector_function = cursor.drop_function( "DummyObjectDetector", if_exists=True ) - drop_dummy_object_detector_udf.execute() + drop_dummy_object_detector_function.execute() # Check if deleted successfully select_query_sql = ( @@ -293,18 +293,18 @@ def test_drop_with_relational_api(self): with self.assertRaises(BinderError): cursor.query(select_query_sql).execute() - # drop non existing udf with if_exists=True should not raise error - drop_dummy_object_detector_udf = cursor.drop_function( + # drop non existing function with if_exists=True should not raise error + drop_dummy_object_detector_function = cursor.drop_function( "DummyObjectDetector", if_exists=True ) - drop_dummy_object_detector_udf.execute() + drop_dummy_object_detector_function.execute() # if_exists=False should raise error - drop_dummy_object_detector_udf = cursor.drop_function( + drop_dummy_object_detector_function = cursor.drop_function( "DummyObjectDetector", if_exists=False ) with self.assertRaises(ExecutorError): - drop_dummy_object_detector_udf.execute() + 
drop_dummy_object_detector_function.execute() # drop existing table drop_table = cursor.drop_table("dummy_video", if_exists=True) @@ -332,14 +332,14 @@ def test_pdf_similarity_search(self): load_pdf = cursor.load(file_regex=pdf_path, format="PDF", table_name="PDFs") load_pdf.execute() - udf_check = cursor.drop_function("SentenceFeatureExtractor") - udf_check.df() - udf = cursor.create_function( + function_check = cursor.drop_function("SentenceFeatureExtractor") + function_check.df() + function = cursor.create_function( "SentenceFeatureExtractor", True, - f"{EvaDB_ROOT_DIR}/evadb/udfs/sentence_feature_extractor.py", + f"{EvaDB_ROOT_DIR}/evadb/functions/sentence_feature_extractor.py", ) - udf.execute() + function.execute() cursor.create_vector_index( "faiss_index", diff --git a/test/integration_tests/long/test_array_count.py b/test/integration_tests/long/test_array_count.py index aed7aa9574..73dbff1d4f 100644 --- a/test/integration_tests/long/test_array_count.py +++ b/test/integration_tests/long/test_array_count.py @@ -18,7 +18,7 @@ create_sample_video, file_remove, get_evadb_for_testing, - load_udfs_for_testing, + load_functions_for_testing, shutdown_ray, ) @@ -38,7 +38,7 @@ def setUpClass(cls): video_file_path = create_sample_video(NUM_FRAMES) load_query = f"LOAD VIDEO '{video_file_path}' INTO MyVideo;" execute_query_fetch_all(cls.evadb, load_query) - load_udfs_for_testing(cls.evadb, mode="debug") + load_functions_for_testing(cls.evadb, mode="debug") @classmethod def tearDownClass(cls): @@ -48,7 +48,7 @@ def tearDownClass(cls): # integration test - def test_should_load_and_select_using_udf_video(self): + def test_should_load_and_select_using_function_video(self): # Equality test select_query = "SELECT id,DummyObjectDetector(data) FROM MyVideo \ WHERE DummyObjectDetector(data).label = ['person'] ORDER BY id;" diff --git a/test/integration_tests/long/test_create_index_executor.py b/test/integration_tests/long/test_create_index_executor.py index f81af9da22..9444f414db 
100644 --- a/test/integration_tests/long/test_create_index_executor.py +++ b/test/integration_tests/long/test_create_index_executor.py @@ -15,7 +15,7 @@ import unittest from pathlib import Path from test.markers import macos_skip_marker -from test.util import get_evadb_for_testing, load_udfs_for_testing +from test.util import get_evadb_for_testing, load_functions_for_testing import numpy as np import pandas as pd @@ -39,7 +39,7 @@ def _index_save_path(self): @classmethod def setUpClass(cls): cls.evadb = get_evadb_for_testing() - load_udfs_for_testing(cls.evadb, mode="debug") + load_functions_for_testing(cls.evadb, mode="debug") # Create feature vector table and raw input table. feat1 = np.array([[0, 0, 0]]).astype(np.float32) @@ -112,7 +112,7 @@ def test_should_create_index_faiss(self): self._index_save_path(), ) self.assertEqual( - index_catalog_entry.udf_signature, + index_catalog_entry.function_signature, None, ) @@ -137,11 +137,11 @@ def test_should_create_index_faiss(self): self.evadb.catalog().drop_index_catalog_entry("testCreateIndexName") @macos_skip_marker - def test_should_create_index_with_udf(self): + def test_should_create_index_with_function(self): query = "CREATE INDEX testCreateIndexName ON testCreateIndexInputTable (DummyFeatureExtractor(input)) USING FAISS;" execute_query_fetch_all(self.evadb, query) - # Test index udf signature. + # Test index function signature. 
index_catalog_entry = self.evadb.catalog().get_index_catalog_entry_by_name( "testCreateIndexName" ) diff --git a/test/integration_tests/long/test_create_table_executor.py b/test/integration_tests/long/test_create_table_executor.py index 87b9e64d8c..40803f7678 100644 --- a/test/integration_tests/long/test_create_table_executor.py +++ b/test/integration_tests/long/test_create_table_executor.py @@ -19,7 +19,7 @@ create_sample_video, file_remove, get_evadb_for_testing, - load_udfs_for_testing, + load_functions_for_testing, shutdown_ray, ) @@ -45,7 +45,7 @@ def setUpClass(cls): execute_query_fetch_all(cls.evadb, load_query) ua_detrac = f"{EvaDB_ROOT_DIR}/data/ua_detrac/ua_detrac.mp4" execute_query_fetch_all(cls.evadb, f"LOAD VIDEO '{ua_detrac}' INTO UATRAC;") - load_udfs_for_testing(cls.evadb) + load_functions_for_testing(cls.evadb) @classmethod def tearDownClass(cls): diff --git a/test/integration_tests/long/test_delete_executor.py b/test/integration_tests/long/test_delete_executor.py index 5c87180f52..e7a553d808 100644 --- a/test/integration_tests/long/test_delete_executor.py +++ b/test/integration_tests/long/test_delete_executor.py @@ -16,7 +16,7 @@ from test.util import ( file_remove, get_evadb_for_testing, - load_udfs_for_testing, + load_functions_for_testing, shutdown_ray, ) @@ -32,7 +32,7 @@ class DeleteExecutorTest(unittest.TestCase): def setUp(self): self.evadb = get_evadb_for_testing() self.evadb.catalog().reset() - load_udfs_for_testing(self.evadb, mode="debug") + load_functions_for_testing(self.evadb, mode="debug") create_table_query = """ CREATE TABLE IF NOT EXISTS testDeleteOne diff --git a/test/integration_tests/long/test_error_handling_with_ray.py b/test/integration_tests/long/test_error_handling_with_ray.py index bf2fdf730f..da134b7ed3 100644 --- a/test/integration_tests/long/test_error_handling_with_ray.py +++ b/test/integration_tests/long/test_error_handling_with_ray.py @@ -21,7 +21,7 @@ create_sample_image, get_evadb_for_testing, 
is_ray_stage_running, - load_udfs_for_testing, + load_functions_for_testing, shutdown_ray, ) @@ -34,8 +34,8 @@ def setUp(self): self.evadb = get_evadb_for_testing() os.environ["ray"] = str(self.evadb.config.get_value("experimental", "ray")) self.evadb.catalog().reset() - # Load built-in UDFs. - load_udfs_for_testing(self.evadb, mode="debug") + # Load built-in Functions. + load_functions_for_testing(self.evadb, mode="debug") # Deliberately create a faulty path. img_path = create_sample_image() @@ -55,13 +55,13 @@ def tearDown(self): @ray_skip_marker def test_ray_error_populate_to_all_stages(self): - udf_name, task = "HFObjectDetector", "image-classification" - create_udf_query = f"""CREATE UDF {udf_name} + function_name, task = "HFObjectDetector", "image-classification" + create_function_query = f"""CREATE FUNCTION {function_name} TYPE HuggingFace - 'task' '{task}' + TASK '{task}' """ - execute_query_fetch_all(self.evadb, create_udf_query) + execute_query_fetch_all(self.evadb, create_function_query) select_query = """SELECT HFObjectDetector(data) FROM testRayErrorHandling;""" diff --git a/test/integration_tests/long/test_explain_executor.py b/test/integration_tests/long/test_explain_executor.py index a504eaea79..9d2c8e8ca5 100644 --- a/test/integration_tests/long/test_explain_executor.py +++ b/test/integration_tests/long/test_explain_executor.py @@ -17,7 +17,7 @@ create_sample_video, file_remove, get_evadb_for_testing, - load_udfs_for_testing, + load_functions_for_testing, ) import pytest @@ -43,7 +43,7 @@ def setUpClass(cls): video_file_path = create_sample_video(NUM_FRAMES) load_query = f"LOAD VIDEO '{video_file_path}' INTO MyVideo;" execute_query_fetch_all(cls.evadb, load_query) - load_udfs_for_testing(cls.evadb, mode="debug") + load_functions_for_testing(cls.evadb, mode="debug") @classmethod def tearDownClass(cls): diff --git a/test/integration_tests/long/test_udf_executor.py b/test/integration_tests/long/test_function_executor.py similarity index 65% rename 
from test/integration_tests/long/test_udf_executor.py rename to test/integration_tests/long/test_function_executor.py index 3a07939710..28368ddad8 100644 --- a/test/integration_tests/long/test_udf_executor.py +++ b/test/integration_tests/long/test_function_executor.py @@ -37,7 +37,7 @@ @pytest.mark.notparallel -class UDFExecutorTest(unittest.TestCase): +class FunctionExecutorTest(unittest.TestCase): def setUp(self): self.evadb = get_evadb_for_testing() self.evadb.catalog().reset() @@ -45,13 +45,13 @@ def setUp(self): load_query = f"LOAD VIDEO '{video_file_path}' INTO MyVideo;" execute_query_fetch_all(self.evadb, load_query) - create_udf_query = """CREATE UDF DummyObjectDetector + create_function_query = """CREATE FUNCTION DummyObjectDetector INPUT (Frame_Array NDARRAY UINT8(3, 256, 256)) OUTPUT (label NDARRAY STR(10)) TYPE Classification IMPL 'test/util.py'; """ - execute_query_fetch_all(self.evadb, create_udf_query) + execute_query_fetch_all(self.evadb, create_function_query) def tearDown(self): shutdown_ray() @@ -60,7 +60,7 @@ def tearDown(self): # integration test - def test_should_load_and_select_using_udf_video_in_table(self): + def test_should_load_and_select_using_function_video_in_table(self): select_query = "SELECT id,DummyObjectDetector(data) FROM MyVideo \ ORDER BY id;" actual_batch = execute_query_fetch_all(self.evadb, select_query) @@ -75,7 +75,7 @@ def test_should_load_and_select_using_udf_video_in_table(self): expected_batch = Batch(frames=pd.DataFrame(expected)) self.assertEqual(actual_batch, expected_batch) - def test_should_load_and_select_using_udf_video(self): + def test_should_load_and_select_using_function_video(self): # Equality test select_query = "SELECT id,DummyObjectDetector(data) FROM MyVideo \ WHERE DummyObjectDetector(data).label = ['person'] ORDER BY id;" @@ -140,81 +140,92 @@ def test_should_load_and_select_using_udf_video(self): expected_batch.modify_column_alias("T") self.assertEqual(actual_batch, expected_batch) - def 
test_create_udf(self): - udf_name = "DummyObjectDetector" - create_udf_query = """CREATE UDF {} + def test_create_function(self): + function_name = "DummyObjectDetector" + create_function_query = """CREATE FUNCTION {} INPUT (Frame_Array NDARRAY UINT8(3, 256, 256)) OUTPUT (label NDARRAY STR(10)) TYPE Classification IMPL 'test/util.py'; """ - # Try to create duplicate UDF + # Try to create duplicate FUNCTION with self.assertRaises(ExecutorError): actual = execute_query_fetch_all( - self.evadb, create_udf_query.format(udf_name) + self.evadb, create_function_query.format(function_name) + ) + expected = Batch( + pd.DataFrame([f"Function {function_name} already exists."]) ) - expected = Batch(pd.DataFrame([f"UDF {udf_name} already exists."])) self.assertEqual(actual, expected) - # Try to create UDF if not exists + # Try to create FUNCTION if not exists actual = execute_query_fetch_all( - self.evadb, create_udf_query.format("IF NOT EXISTS " + udf_name) + self.evadb, create_function_query.format("IF NOT EXISTS " + function_name) ) expected = Batch( - pd.DataFrame([f"UDF {udf_name} already exists, nothing added."]) + pd.DataFrame([f"Function {function_name} already exists, nothing added."]) ) self.assertEqual(actual, expected) - def test_should_create_udf_with_metadata(self): - udf_name = "DummyObjectDetector" - execute_query_fetch_all(self.evadb, f"DROP UDF {udf_name};") - create_udf_query = """CREATE UDF {} + def test_should_create_function_with_metadata(self): + function_name = "DummyObjectDetector" + execute_query_fetch_all(self.evadb, f"DROP FUNCTION {function_name};") + create_function_query = """CREATE FUNCTION {} INPUT (Frame_Array NDARRAY UINT8(3, 256, 256)) OUTPUT (label NDARRAY STR(10)) TYPE Classification IMPL 'test/util.py' - 'CACHE' 'TRUE' - 'BATCH' 'FALSE'; + CACHE 'TRUE' + BATCH 'FALSE'; """ - execute_query_fetch_all(self.evadb, create_udf_query.format(udf_name)) + execute_query_fetch_all(self.evadb, create_function_query.format(function_name)) # try 
fetching the metadata values - entries = self.evadb.catalog().get_udf_metadata_entries_by_udf_name(udf_name) + entries = self.evadb.catalog().get_function_metadata_entries_by_function_name( + function_name + ) self.assertEqual(len(entries), 2) metadata = [(entry.key, entry.value) for entry in entries] - expected_metadata = [("CACHE", "TRUE"), ("BATCH", "FALSE")] + # metadata ultimately stored as lowercase string literals in metadata + expected_metadata = [("cache", "TRUE"), ("batch", "FALSE")] self.assertEqual(set(metadata), set(expected_metadata)) - def test_should_return_empty_metadata_list_for_missing_udf(self): - # missing udf should return empty list - entries = self.evadb.catalog().get_udf_metadata_entries_by_udf_name("randomUDF") + def test_should_return_empty_metadata_list_for_missing_function(self): + # missing function should return empty list + entries = self.evadb.catalog().get_function_metadata_entries_by_function_name( + "randomFunction" + ) self.assertEqual(len(entries), 0) - def test_should_return_empty_metadata_list_if_udf_is_removed(self): - udf_name = "DummyObjectDetector" - execute_query_fetch_all(self.evadb, f"DROP UDF {udf_name};") - create_udf_query = """CREATE UDF {} + def test_should_return_empty_metadata_list_if_function_is_removed(self): + function_name = "DummyObjectDetector" + execute_query_fetch_all(self.evadb, f"DROP FUNCTION {function_name};") + create_function_query = """CREATE FUNCTION {} INPUT (Frame_Array NDARRAY UINT8(3, 256, 256)) OUTPUT (label NDARRAY STR(10)) TYPE Classification IMPL 'test/util.py' - 'CACHE' 'TRUE' - 'BATCH' 'FALSE'; + CACHE 'TRUE' + BATCH 'FALSE'; """ - execute_query_fetch_all(self.evadb, create_udf_query.format(udf_name)) + execute_query_fetch_all(self.evadb, create_function_query.format(function_name)) # try fetching the metadata values - entries = self.evadb.catalog().get_udf_metadata_entries_by_udf_name(udf_name) + entries = self.evadb.catalog().get_function_metadata_entries_by_function_name( + 
function_name + ) self.assertEqual(len(entries), 2) - # remove the udf - execute_query_fetch_all(self.evadb, f"DROP UDF {udf_name};") + # remove the function + execute_query_fetch_all(self.evadb, f"DROP FUNCTION {function_name};") # try fetching the metadata values - entries = self.evadb.catalog().get_udf_metadata_entries_by_udf_name(udf_name) + entries = self.evadb.catalog().get_function_metadata_entries_by_function_name( + function_name + ) self.assertEqual(len(entries), 0) - def test_should_raise_using_missing_udf(self): + def test_should_raise_using_missing_function(self): select_query = "SELECT id,DummyObjectDetector1(data) FROM MyVideo \ ORDER BY id;" with self.assertRaises(BinderError) as cm: @@ -224,12 +235,12 @@ def test_should_raise_using_missing_udf(self): err_msg = ( "Function 'DummyObjectDetector1' does not exist in the catalog. " - "Please create the function using CREATE UDF command." + "Please create the function using CREATE FUNCTION command." ) self.assertEqual(str(cm.exception), err_msg) - def test_should_raise_for_udf_name_mismatch(self): - create_udf_query = """CREATE UDF TestUDF + def test_should_raise_for_function_name_mismatch(self): + create_function_query = """CREATE FUNCTION TestFUNCTION INPUT (Frame_Array NDARRAY UINT8(3, 256, 256)) OUTPUT (label NDARRAY STR(10)) TYPE Classification @@ -237,14 +248,16 @@ def test_should_raise_for_udf_name_mismatch(self): """ with self.assertRaises(ExecutorError): execute_query_fetch_all( - self.evadb, create_udf_query, do_not_print_exceptions=True + self.evadb, create_function_query, do_not_print_exceptions=True ) - def test_should_raise_if_udf_file_is_modified(self): - execute_query_fetch_all(self.evadb, "DROP UDF DummyObjectDetector;") + def test_should_raise_if_function_file_is_modified(self): + execute_query_fetch_all(self.evadb, "DROP FUNCTION DummyObjectDetector;") # Test IF EXISTS - execute_query_fetch_all(self.evadb, "DROP UDF IF EXISTS DummyObjectDetector;") + execute_query_fetch_all( + 
self.evadb, "DROP FUNCTION IF EXISTS DummyObjectDetector;" + ) with tempfile.NamedTemporaryFile(mode="w", suffix=".py") as tmp_file: with open("test/util.py", "r") as file: @@ -252,18 +265,18 @@ def test_should_raise_if_udf_file_is_modified(self): tmp_file.seek(0) - udf_name = "DummyObjectDetector" - create_udf_query = """CREATE UDF {} + function_name = "DummyObjectDetector" + create_function_query = """CREATE FUNCTION {} INPUT (Frame_Array NDARRAY UINT8(3, 256, 256)) OUTPUT (label NDARRAY STR(10)) TYPE Classification IMPL '{}'; """ execute_query_fetch_all( - self.evadb, create_udf_query.format(udf_name, tmp_file.name) + self.evadb, create_function_query.format(function_name, tmp_file.name) ) - # Modify the udf file by appending + # Modify the function file by appending tmp_file.seek(0, 2) tmp_file.write("#comment") tmp_file.seek(0) @@ -272,27 +285,29 @@ def test_should_raise_if_udf_file_is_modified(self): "SELECT id,DummyObjectDetector(data) FROM MyVideo ORDER BY id;" ) - # disabling warning for UDF modificiation for now + # disabling warning for function modificiation for now # with self.assertRaises(AssertionError): execute_query_fetch_all(self.evadb, select_query) - def test_create_udf_with_decorators(self): + def test_create_function_with_decorators(self): execute_query_fetch_all( - self.evadb, "DROP UDF IF EXISTS DummyObjectDetectorDecorators;" + self.evadb, "DROP FUNCTION IF EXISTS DummyObjectDetectorDecorators;" ) - create_udf_query = """CREATE UDF DummyObjectDetectorDecorators + create_function_query = """CREATE FUNCTION DummyObjectDetectorDecorators IMPL 'test/util.py'; """ - execute_query_fetch_all(self.evadb, create_udf_query) + execute_query_fetch_all(self.evadb, create_function_query) catalog_manager = self.evadb.catalog() - udf_obj = catalog_manager.get_udf_catalog_entry_by_name( + function_obj = catalog_manager.get_function_catalog_entry_by_name( "DummyObjectDetectorDecorators" ) - udf_inputs = 
catalog_manager.get_udf_io_catalog_input_entries(udf_obj) - self.assertEquals(len(udf_inputs), 1) + function_inputs = catalog_manager.get_function_io_catalog_input_entries( + function_obj + ) + self.assertEquals(len(function_inputs), 1) - udf_input = udf_inputs[0] + function_input = function_inputs[0] expected_input_attributes = { "name": "Frame_Array", @@ -304,12 +319,16 @@ def test_create_udf_with_decorators(self): } for attr in expected_input_attributes: - self.assertEquals(getattr(udf_input, attr), expected_input_attributes[attr]) + self.assertEquals( + getattr(function_input, attr), expected_input_attributes[attr] + ) - udf_outputs = catalog_manager.get_udf_io_catalog_output_entries(udf_obj) - self.assertEquals(len(udf_outputs), 1) + function_outputs = catalog_manager.get_function_io_catalog_output_entries( + function_obj + ) + self.assertEquals(len(function_outputs), 1) - udf_output = udf_outputs[0] + function_output = function_outputs[0] expected_output_attributes = { "name": "label", "type": ColumnType.NDARRAY, @@ -321,12 +340,14 @@ def test_create_udf_with_decorators(self): for attr in expected_output_attributes: self.assertEquals( - getattr(udf_output, attr), expected_output_attributes[attr] + getattr(function_output, attr), expected_output_attributes[attr] ) - def test_udf_cost_entry_created(self): + def test_function_cost_entry_created(self): execute_query_fetch_all( self.evadb, "SELECT DummyObjectDetector(data) FROM MyVideo" ) - entry = self.evadb.catalog().get_udf_cost_catalog_entry("DummyObjectDetector") + entry = self.evadb.catalog().get_function_cost_catalog_entry( + "DummyObjectDetector" + ) self.assertIsNotNone(entry) diff --git a/test/integration_tests/long/test_fuzzy_join.py b/test/integration_tests/long/test_fuzzy_join.py index 2dee91e53a..6f547102f6 100644 --- a/test/integration_tests/long/test_fuzzy_join.py +++ b/test/integration_tests/long/test_fuzzy_join.py @@ -25,8 +25,8 @@ import pytest from evadb.configuration.constants import 
EvaDB_ROOT_DIR +from evadb.functions.function_bootstrap_queries import fuzzy_function_query from evadb.server.command_handler import execute_query_fetch_all -from evadb.udfs.udf_bootstrap_queries import fuzzy_udf_query @pytest.mark.notparallel @@ -40,7 +40,7 @@ def setUp(self): ) self.csv_file_path = create_sample_csv() - # Prepare needed UDFs and data. + # Prepare needed Functions and data. # loading a csv requires a table to be created first create_table_query = """ CREATE TABLE IF NOT EXISTS MyVideoCSV ( @@ -73,7 +73,7 @@ def tearDown(self): execute_query_fetch_all(self.evadb, "DROP TABLE IF EXISTS MyVideoCSV;") def test_fuzzyjoin(self): - execute_query_fetch_all(self.evadb, fuzzy_udf_query) + execute_query_fetch_all(self.evadb, fuzzy_function_query) # TODO this test does not make sense. Need to improve fuzzy_join_query = """SELECT * FROM MyVideo a JOIN MyVideoCSV b diff --git a/test/integration_tests/long/test_huggingface_udfs.py b/test/integration_tests/long/test_huggingface_functions.py similarity index 51% rename from test/integration_tests/long/test_huggingface_udfs.py rename to test/integration_tests/long/test_huggingface_functions.py index ace203c5e5..e2a258525a 100644 --- a/test/integration_tests/long/test_huggingface_udfs.py +++ b/test/integration_tests/long/test_huggingface_functions.py @@ -52,22 +52,22 @@ def tearDown(self) -> None: file_remove(self.csv_file_path) def test_io_catalog_entries_populated(self): - udf_name, task = "HFObjectDetector", "image-classification" - create_udf_query = f"""CREATE UDF {udf_name} + function_name, task = "HFObjectDetector", "image-classification" + create_function_query = f"""CREATE FUNCTION {function_name} TYPE HuggingFace - 'task' '{task}' + TASK '{task}' """ - execute_query_fetch_all(self.evadb, create_udf_query) + execute_query_fetch_all(self.evadb, create_function_query) catalog = self.evadb.catalog() - udf = catalog.get_udf_catalog_entry_by_name(udf_name) - input_entries = 
catalog.get_udf_io_catalog_input_entries(udf) - output_entries = catalog.get_udf_io_catalog_output_entries(udf) + function = catalog.get_function_catalog_entry_by_name(function_name) + input_entries = catalog.get_function_io_catalog_input_entries(function) + output_entries = catalog.get_function_io_catalog_output_entries(function) # Verify that there is one input entry with the name text self.assertEqual(len(input_entries), 1) - self.assertEqual(input_entries[0].name, f"{udf_name}_IMAGE") + self.assertEqual(input_entries[0].name, f"{function_name}_IMAGE") # Verify that there are 3 output entries with the names score, label and box self.assertEqual(len(output_entries), 2) @@ -75,32 +75,32 @@ def test_io_catalog_entries_populated(self): self.assertEqual(output_entries[1].name, "label") def test_raise_error_on_unsupported_task(self): - udf_name = "HFUnsupportedTask" + function_name = "HFUnsupportedTask" task = "zero-shot-object-detection" - create_udf_query = f"""CREATE UDF {udf_name} + create_function_query = f"""CREATE FUNCTION {function_name} TYPE HuggingFace - 'task' '{task}' + TASK '{task}' """ # catch an assert with self.assertRaises(ExecutorError) as exc_info: execute_query_fetch_all( - self.evadb, create_udf_query, do_not_print_exceptions=True + self.evadb, create_function_query, do_not_print_exceptions=True ) self.assertIn( f"Task {task} not supported in EvaDB currently", str(exc_info.exception) ) def test_object_detection(self): - udf_name = "HFObjectDetector" - create_udf_query = f"""CREATE UDF {udf_name} + function_name = "HFObjectDetector" + create_function_query = f"""CREATE FUNCTION {function_name} TYPE HuggingFace - 'task' 'object-detection' - 'model' 'facebook/detr-resnet-50'; + TASK 'object-detection' + MODEL 'facebook/detr-resnet-50'; """ - execute_query_fetch_all(self.evadb, create_udf_query) + execute_query_fetch_all(self.evadb, create_function_query) - select_query = f"SELECT {udf_name}(data) FROM DETRAC WHERE id < 4;" + select_query = f"SELECT 
{function_name}(data) FROM DETRAC WHERE id < 4;" output = execute_query_fetch_all(self.evadb, select_query) output_frames = output.frames @@ -110,21 +110,27 @@ def test_object_detection(self): # Test that number of rows is equal to 10 self.assertEqual(len(output.frames), 4) - # Test that there exists a column with udf_name.score and each entry is a list of floats - self.assertTrue(udf_name.lower() + ".score" in output_frames.columns) + # Test that there exists a column with function_name.score and each entry is a list of floats + self.assertTrue(function_name.lower() + ".score" in output_frames.columns) self.assertTrue( - all(isinstance(x, list) for x in output.frames[udf_name.lower() + ".score"]) + all( + isinstance(x, list) + for x in output.frames[function_name.lower() + ".score"] + ) ) - # Test that there exists a column with udf_name.label and each entry is a list of strings - self.assertTrue(udf_name.lower() + ".label" in output_frames.columns) + # Test that there exists a column with function_name.label and each entry is a list of strings + self.assertTrue(function_name.lower() + ".label" in output_frames.columns) self.assertTrue( - all(isinstance(x, list) for x in output.frames[udf_name.lower() + ".label"]) + all( + isinstance(x, list) + for x in output.frames[function_name.lower() + ".label"] + ) ) - # Test that there exists a column with udf_name.box and each entry is a dictionary with 4 keys - self.assertTrue(udf_name.lower() + ".box" in output_frames.columns) - for bbox in output.frames[udf_name.lower() + ".box"]: + # Test that there exists a column with function_name.box and each entry is a dictionary with 4 keys + self.assertTrue(function_name.lower() + ".box" in output_frames.columns) + for bbox in output.frames[function_name.lower() + ".box"]: self.assertTrue(isinstance(bbox, list)) bbox = bbox[0] self.assertTrue(isinstance(bbox, dict)) @@ -134,37 +140,43 @@ def test_object_detection(self): self.assertTrue("xmax" in bbox) self.assertTrue("ymax" in 
bbox) - drop_udf_query = f"DROP UDF {udf_name};" - execute_query_fetch_all(self.evadb, drop_udf_query) + drop_function_query = f"DROP FUNCTION {function_name};" + execute_query_fetch_all(self.evadb, drop_function_query) def test_image_classification(self): - udf_name = "HFImageClassifier" - create_udf_query = f"""CREATE UDF {udf_name} + function_name = "HFImageClassifier" + create_function_query = f"""CREATE FUNCTION {function_name} TYPE HuggingFace - 'task' 'image-classification' + TASK 'image-classification' """ - execute_query_fetch_all(self.evadb, create_udf_query) + execute_query_fetch_all(self.evadb, create_function_query) - select_query = f"SELECT {udf_name}(data) FROM DETRAC WHERE id < 3;" + select_query = f"SELECT {function_name}(data) FROM DETRAC WHERE id < 3;" output = execute_query_fetch_all(self.evadb, select_query) # Test that output has 2 columns self.assertEqual(len(output.frames.columns), 2) - # Test that there exists a column with udf_name.score and each entry is a list of floats - self.assertTrue(udf_name.lower() + ".score" in output.frames.columns) + # Test that there exists a column with function_name.score and each entry is a list of floats + self.assertTrue(function_name.lower() + ".score" in output.frames.columns) self.assertTrue( - all(isinstance(x, list) for x in output.frames[udf_name.lower() + ".score"]) + all( + isinstance(x, list) + for x in output.frames[function_name.lower() + ".score"] + ) ) - # Test that there exists a column with udf_name.label and each entry is a list of strings - self.assertTrue(udf_name.lower() + ".label" in output.frames.columns) + # Test that there exists a column with function_name.label and each entry is a list of strings + self.assertTrue(function_name.lower() + ".label" in output.frames.columns) self.assertTrue( - all(isinstance(x, list) for x in output.frames[udf_name.lower() + ".label"]) + all( + isinstance(x, list) + for x in output.frames[function_name.lower() + ".label"] + ) ) - drop_udf_query = 
f"DROP UDF {udf_name};" - execute_query_fetch_all(self.evadb, drop_udf_query) + drop_function_query = f"DROP FUNCTION {function_name};" + execute_query_fetch_all(self.evadb, drop_function_query) @pytest.mark.benchmark def test_text_classification(self): @@ -177,51 +189,52 @@ def test_text_classification(self): load_table_query = f"""LOAD CSV '{self.csv_file_path}' INTO MyCSV;""" execute_query_fetch_all(self.evadb, load_table_query) - udf_name = "HFTextClassifier" - create_udf_query = f"""CREATE UDF {udf_name} + function_name = "HFTextClassifier" + create_function_query = f"""CREATE FUNCTION {function_name} TYPE HuggingFace - 'task' 'text-classification' + TASK 'text-classification' """ - execute_query_fetch_all(self.evadb, create_udf_query) + execute_query_fetch_all(self.evadb, create_function_query) - select_query = f"SELECT {udf_name}(comment) FROM MyCSV;" + select_query = f"SELECT {function_name}(comment) FROM MyCSV;" output = execute_query_fetch_all(self.evadb, select_query) # Test that output has 2 columns self.assertEqual(len(output.frames.columns), 2) - # Test that there exists a column with udf_name.label and each entry is either "POSITIVE" or "NEGATIVE" - self.assertTrue(udf_name.lower() + ".label" in output.frames.columns) + # Test that there exists a column with function_name.label and each entry is either "POSITIVE" or "NEGATIVE" + self.assertTrue(function_name.lower() + ".label" in output.frames.columns) self.assertTrue( all( x in ["POSITIVE", "NEGATIVE"] - for x in output.frames[udf_name.lower() + ".label"] + for x in output.frames[function_name.lower() + ".label"] ) ) - # Test that there exists a column with udf_name.score and each entry is a float - self.assertTrue(udf_name.lower() + ".score" in output.frames.columns) + # Test that there exists a column with function_name.score and each entry is a float + self.assertTrue(function_name.lower() + ".score" in output.frames.columns) self.assertTrue( all( - isinstance(x, float) for x in 
output.frames[udf_name.lower() + ".score"] + isinstance(x, float) + for x in output.frames[function_name.lower() + ".score"] ) ) - drop_udf_query = f"DROP UDF {udf_name};" - execute_query_fetch_all(self.evadb, drop_udf_query) + drop_function_query = f"DROP FUNCTION {function_name};" + execute_query_fetch_all(self.evadb, drop_function_query) execute_query_fetch_all(self.evadb, "DROP TABLE MyCSV;") @pytest.mark.benchmark def test_automatic_speech_recognition(self): - udf_name = "SpeechRecognizer" - create_udf = ( - f"CREATE UDF {udf_name} TYPE HuggingFace " - "'task' 'automatic-speech-recognition' 'model' 'openai/whisper-base';" + function_name = "SpeechRecognizer" + create_function = ( + f"CREATE FUNCTION {function_name} TYPE HuggingFace " + "TASK 'automatic-speech-recognition' MODEL 'openai/whisper-base';" ) - execute_query_fetch_all(self.evadb, create_udf) + execute_query_fetch_all(self.evadb, create_function) # TODO: use with SAMPLE AUDIORATE 16000 - select_query = f"SELECT {udf_name}(audio) FROM VIDEOS;" + select_query = f"SELECT {function_name}(audio) FROM VIDEOS;" output = execute_query_fetch_all(self.evadb, select_query) # verify that output has one row and one column only @@ -229,9 +242,7 @@ def test_automatic_speech_recognition(self): # verify that speech was converted to text correctly self.assertTrue(output.frames.iloc[0][0].count("touchdown") == 2) - select_query_with_group_by = ( - f"SELECT {udf_name}(SEGMENT(audio)) FROM VIDEOS GROUP BY '240 samples';" - ) + select_query_with_group_by = f"SELECT {function_name}(SEGMENT(audio)) FROM VIDEOS GROUP BY '240 samples';" output = execute_query_fetch_all(self.evadb, select_query_with_group_by) # verify that output has one row and one column only @@ -239,27 +250,27 @@ def test_automatic_speech_recognition(self): # verify that speech was converted to text correctly self.assertEquals(output.frames.iloc[0][0].count("touchdown"), 1) - drop_udf_query = f"DROP UDF {udf_name};" - execute_query_fetch_all(self.evadb, 
drop_udf_query) + drop_function_query = f"DROP FUNCTION {function_name};" + execute_query_fetch_all(self.evadb, drop_function_query) @pytest.mark.benchmark def test_summarization_from_video(self): - asr_udf = "SpeechRecognizer" - create_udf = ( - f"CREATE UDF {asr_udf} TYPE HuggingFace " - "'task' 'automatic-speech-recognition' 'model' 'openai/whisper-base';" + asr_function = "SpeechRecognizer" + create_function = ( + f"CREATE FUNCTION {asr_function} TYPE HuggingFace " + "TASK 'automatic-speech-recognition' MODEL 'openai/whisper-base';" ) - execute_query_fetch_all(self.evadb, create_udf) + execute_query_fetch_all(self.evadb, create_function) - summary_udf = "Summarizer" - create_udf = ( - f"CREATE UDF {summary_udf} TYPE HuggingFace " - "'task' 'summarization' 'model' 'philschmid/bart-large-cnn-samsum' 'min_length' 10 'max_new_tokens' 100;" + summary_function = "Summarizer" + create_function = ( + f"CREATE FUNCTION {summary_function} TYPE HuggingFace " + "TASK 'summarization' MODEL 'philschmid/bart-large-cnn-samsum' MIN_LENGTH 10 MAX_NEW_TOKENS 100;" ) - execute_query_fetch_all(self.evadb, create_udf) + execute_query_fetch_all(self.evadb, create_function) # TODO: use with SAMPLE AUDIORATE 16000 - select_query = f"SELECT {summary_udf}({asr_udf}(audio)) FROM VIDEOS;" + select_query = f"SELECT {summary_function}({asr_function}(audio)) FROM VIDEOS;" output = execute_query_fetch_all(self.evadb, select_query) # verify that output has one row and one column only @@ -270,19 +281,19 @@ def test_summarization_from_video(self): == "Jalen Hurts has scored his second rushing touchdown of the game." 
) - drop_udf_query = f"DROP UDF {asr_udf};" - execute_query_fetch_all(self.evadb, drop_udf_query) - drop_udf_query = f"DROP UDF {summary_udf};" - execute_query_fetch_all(self.evadb, drop_udf_query) + drop_function_query = f"DROP FUNCTION {asr_function};" + execute_query_fetch_all(self.evadb, drop_function_query) + drop_function_query = f"DROP FUNCTION {summary_function};" + execute_query_fetch_all(self.evadb, drop_function_query) def test_toxicity_classification(self): - udf_name = "HFToxicityClassifier" - create_udf_query = f"""CREATE UDF {udf_name} + function_name = "HFToxicityClassifier" + create_function_query = f"""CREATE FUNCTION {function_name} TYPE HuggingFace - 'task' 'text-classification' - 'model' 'martin-ha/toxic-comment-model' + TASK 'text-classification' + MODEL 'martin-ha/toxic-comment-model' """ - execute_query_fetch_all(self.evadb, create_udf_query) + execute_query_fetch_all(self.evadb, create_function_query) drop_table_query = """DROP TABLE IF EXISTS MyCSV;""" execute_query_fetch_all(self.evadb, drop_table_query) @@ -296,42 +307,43 @@ def test_toxicity_classification(self): load_table_query = f"""LOAD CSV '{self.csv_file_path}' INTO MyCSV;""" execute_query_fetch_all(self.evadb, load_table_query) - select_query = f"SELECT {udf_name}(comment) FROM MyCSV;" + select_query = f"SELECT {function_name}(comment) FROM MyCSV;" output = execute_query_fetch_all(self.evadb, select_query) # Test that output has 2 columns self.assertEqual(len(output.frames.columns), 2) - # Test that there exists a column with udf_name.label and each entry is either "POSITIVE" or "NEGATIVE" - self.assertTrue(udf_name.lower() + ".label" in output.frames.columns) + # Test that there exists a column with function_name.label and each entry is either "POSITIVE" or "NEGATIVE" + self.assertTrue(function_name.lower() + ".label" in output.frames.columns) self.assertTrue( all( x in ["non-toxic", "toxic"] - for x in output.frames[udf_name.lower() + ".label"] + for x in 
output.frames[function_name.lower() + ".label"] ) ) - # Test that there exists a column with udf_name.score + # Test that there exists a column with function_name.score # and each entry is a float - self.assertTrue(udf_name.lower() + ".score" in output.frames.columns) + self.assertTrue(function_name.lower() + ".score" in output.frames.columns) self.assertTrue( all( - isinstance(x, float) for x in output.frames[udf_name.lower() + ".score"] + isinstance(x, float) + for x in output.frames[function_name.lower() + ".score"] ) ) - drop_udf_query = f"DROP UDF {udf_name};" - execute_query_fetch_all(self.evadb, drop_udf_query) + drop_function_query = f"DROP FUNCTION {function_name};" + execute_query_fetch_all(self.evadb, drop_function_query) @pytest.mark.benchmark def test_multilingual_toxicity_classification(self): - udf_name = "HFMultToxicityClassifier" - create_udf_query = f"""CREATE UDF {udf_name} + function_name = "HFMultToxicityClassifier" + create_function_query = f"""CREATE FUNCTION {function_name} TYPE HuggingFace - 'task' 'text-classification' - 'model' 'EIStakovskii/xlm_roberta_base_multilingual_toxicity_classifier_plus' + TASK 'text-classification' + MODEL 'EIStakovskii/xlm_roberta_base_multilingual_toxicity_classifier_plus' """ - execute_query_fetch_all(self.evadb, create_udf_query) + execute_query_fetch_all(self.evadb, create_function_query) drop_table_query = """DROP TABLE IF EXISTS MyCSV;""" execute_query_fetch_all(self.evadb, drop_table_query) @@ -345,56 +357,57 @@ def test_multilingual_toxicity_classification(self): load_table_query = f"""LOAD CSV '{self.csv_file_path}' INTO MyCSV;""" execute_query_fetch_all(self.evadb, load_table_query) - select_query = f"SELECT {udf_name}(comment) FROM MyCSV;" + select_query = f"SELECT {function_name}(comment) FROM MyCSV;" output = execute_query_fetch_all(self.evadb, select_query) # Test that output has 2 columns self.assertEqual(len(output.frames.columns), 2) - # Test that there exists a column with udf_name.label and 
each entry is either "POSITIVE" or "NEGATIVE" - self.assertTrue(udf_name.lower() + ".label" in output.frames.columns) + # Test that there exists a column with function_name.label and each entry is either "POSITIVE" or "NEGATIVE" + self.assertTrue(function_name.lower() + ".label" in output.frames.columns) self.assertTrue( all( x in ["LABEL_1", "LABEL_0"] - for x in output.frames[udf_name.lower() + ".label"] + for x in output.frames[function_name.lower() + ".label"] ) ) - # Test that there exists a column with udf_name.score and each entry is a float - self.assertTrue(udf_name.lower() + ".score" in output.frames.columns) + # Test that there exists a column with function_name.score and each entry is a float + self.assertTrue(function_name.lower() + ".score" in output.frames.columns) self.assertTrue( all( - isinstance(x, float) for x in output.frames[udf_name.lower() + ".score"] + isinstance(x, float) + for x in output.frames[function_name.lower() + ".score"] ) ) - drop_udf_query = f"DROP UDF {udf_name};" - execute_query_fetch_all(self.evadb, drop_udf_query) + drop_function_query = f"DROP FUNCTION {function_name};" + execute_query_fetch_all(self.evadb, drop_function_query) @pytest.mark.benchmark def test_named_entity_recognition_model_all_pdf_data(self): - udf_name = "HFNERModel" - create_udf_query = f"""CREATE UDF {udf_name} + function_name = "HFNERModel" + create_function_query = f"""CREATE FUNCTION {function_name} TYPE HuggingFace - 'task' 'ner' + TASK 'ner' """ - execute_query_fetch_all(self.evadb, create_udf_query) + execute_query_fetch_all(self.evadb, create_function_query) # running test case on all the pdf data - select_query = f"SELECT data, {udf_name}(data) FROM MyPDFs;" + select_query = f"SELECT data, {function_name}(data) FROM MyPDFs;" output = execute_query_fetch_all(self.evadb, select_query) # Test that output has 7 columns self.assertEqual(len(output.frames.columns), 7) - # Test that there exists a column with udf_name.entity - 
self.assertTrue(udf_name.lower() + ".entity" in output.frames.columns) + # Test that there exists a column with function_name.entity + self.assertTrue(function_name.lower() + ".entity" in output.frames.columns) - # Test that there exists a column with udf_name.score - self.assertTrue(udf_name.lower() + ".score" in output.frames.columns) + # Test that there exists a column with function_name.score + self.assertTrue(function_name.lower() + ".score" in output.frames.columns) - drop_udf_query = f"DROP UDF {udf_name};" - execute_query_fetch_all(self.evadb, drop_udf_query) + drop_function_query = f"DROP FUNCTION {function_name};" + execute_query_fetch_all(self.evadb, drop_function_query) def test_select_and_groupby_with_paragraphs(self): segment_size = 10 @@ -408,15 +421,15 @@ def test_select_and_groupby_with_paragraphs(self): @pytest.mark.benchmark def test_named_entity_recognition_model_no_ner_data_exists(self): - udf_name = "HFNERModel" - create_udf_query = f"""CREATE UDF {udf_name} + function_name = "HFNERModel" + create_function_query = f"""CREATE FUNCTION {function_name} TYPE HuggingFace - 'task' 'ner' + TASK 'ner' """ - execute_query_fetch_all(self.evadb, create_udf_query) + execute_query_fetch_all(self.evadb, create_function_query) # running test case where ner gives no data - select_query = f"""SELECT data, {udf_name}(data) + select_query = f"""SELECT data, {function_name}(data) FROM MyPDFs WHERE page = 3 AND paragraph >= 1 AND paragraph <= 3;""" @@ -425,11 +438,11 @@ def test_named_entity_recognition_model_no_ner_data_exists(self): # Test that output only has 1 column (data) self.assertEqual(len(output.frames.columns), 1) - # Test that there does not exist a column with udf_name.entity - self.assertFalse(udf_name.lower() + ".entity" in output.frames.columns) + # Test that there does not exist a column with function_name.entity + self.assertFalse(function_name.lower() + ".entity" in output.frames.columns) - drop_udf_query = f"DROP UDF {udf_name};" - 
execute_query_fetch_all(self.evadb, drop_udf_query) + drop_function_query = f"DROP FUNCTION {function_name};" + execute_query_fetch_all(self.evadb, drop_function_query) if __name__ == "__main__": diff --git a/test/integration_tests/long/test_like.py b/test/integration_tests/long/test_like.py index b3fe8137e7..a3e7509935 100644 --- a/test/integration_tests/long/test_like.py +++ b/test/integration_tests/long/test_like.py @@ -38,15 +38,15 @@ def tearDown(self): @ocr_skip_marker def test_like_with_ocr(self): - create_udf_query = """CREATE UDF IF NOT EXISTS OCRExtractor + create_function_query = """CREATE FUNCTION IF NOT EXISTS OCRExtractor INPUT (frame NDARRAY UINT8(3, ANYDIM, ANYDIM)) OUTPUT (labels NDARRAY STR(10), bboxes NDARRAY FLOAT32(ANYDIM, 4), scores NDARRAY FLOAT32(ANYDIM)) TYPE OCRExtraction - IMPL 'evadb/udfs/ocr_extractor.py'; + IMPL 'evadb/functions/ocr_extractor.py'; """ - execute_query_fetch_all(self.evadb, create_udf_query) + execute_query_fetch_all(self.evadb, create_function_query) select_query = """SELECT X.label, X.x, X.y FROM MemeImages JOIN LATERAL UNNEST(OCRExtractor(data)) AS X(label, x, y) WHERE label LIKE {};""".format( r"""'.*SWAG.*'""" ) @@ -55,15 +55,15 @@ def test_like_with_ocr(self): @ocr_skip_marker def test_like_fails_on_non_string_col(self): - create_udf_query = """CREATE UDF IF NOT EXISTS OCRExtractor + create_function_query = """CREATE FUNCTION IF NOT EXISTS OCRExtractor INPUT (frame NDARRAY UINT8(3, ANYDIM, ANYDIM)) OUTPUT (labels NDARRAY STR(10), bboxes NDARRAY FLOAT32(ANYDIM, 4), scores NDARRAY FLOAT32(ANYDIM)) TYPE OCRExtraction - IMPL 'evadb/udfs/ocr_extractor.py'; + IMPL 'evadb/functions/ocr_extractor.py'; """ - execute_query_fetch_all(self.evadb, create_udf_query) + execute_query_fetch_all(self.evadb, create_function_query) select_query = """SELECT * FROM MemeImages JOIN LATERAL UNNEST(OCRExtractor(data)) AS X(label, x, y) WHERE x LIKE "[A-Za-z]*CANT";""" with self.assertRaises(Exception): diff --git 
a/test/integration_tests/long/test_model_forecasting.py b/test/integration_tests/long/test_model_forecasting.py new file mode 100644 index 0000000000..288eb8dd72 --- /dev/null +++ b/test/integration_tests/long/test_model_forecasting.py @@ -0,0 +1,69 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest +from test.markers import forecast_skip_marker +from test.util import get_evadb_for_testing, shutdown_ray + +import pytest + +from evadb.configuration.constants import EvaDB_ROOT_DIR +from evadb.server.command_handler import execute_query_fetch_all + + +@pytest.mark.notparallel +class ModelTrainTests(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.evadb = get_evadb_for_testing() + # reset the catalog manager before running each test + cls.evadb.catalog().reset() + + create_table_query = """ + CREATE TABLE AirData (\ + unique_id TEXT(30),\ + ds TEXT(30),\ + y INTEGER);""" + execute_query_fetch_all(cls.evadb, create_table_query) + + path = f"{EvaDB_ROOT_DIR}/data/forecasting/air-passengers.csv" + load_query = f"LOAD CSV '{path}' INTO AirData;" + execute_query_fetch_all(cls.evadb, load_query) + + @classmethod + def tearDownClass(cls): + shutdown_ray() + + # clean up + execute_query_fetch_all(cls.evadb, "DROP TABLE IF EXISTS HomeRentals;") + + @forecast_skip_marker + def test_forecast(self): + create_predict_udf = """ + CREATE FUNCTION Forecast FROM + (SELECT unique_id, ds, y FROM AirData) + TYPE 
Forecasting + PREDICT 'y'; + """ + execute_query_fetch_all(self.evadb, create_predict_udf) + + predict_query = """ + SELECT Forecast(12) FROM AirData; + """ + result = execute_query_fetch_all(self.evadb, predict_query) + self.assertEqual(int(list(result.frames.iloc[:, -1])[-1]), 459) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/integration_tests/long/test_model_train.py b/test/integration_tests/long/test_model_train.py index cb25865b68..55ae6da9c1 100644 --- a/test/integration_tests/long/test_model_train.py +++ b/test/integration_tests/long/test_model_train.py @@ -56,14 +56,14 @@ def tearDownClass(cls): @ludwig_skip_marker def test_ludwig_automl(self): - create_predict_udf = """ - CREATE UDF IF NOT EXISTS PredictHouseRent FROM + create_predict_function = """ + CREATE FUNCTION IF NOT EXISTS PredictHouseRent FROM ( SELECT * FROM HomeRentals ) TYPE Ludwig - 'predict' 'rental_price' - 'time_limit' 120; + PREDICT 'rental_price' + TIME_LIMIT 120; """ - execute_query_fetch_all(self.evadb, create_predict_udf) + execute_query_fetch_all(self.evadb, create_predict_function) predict_query = """ SELECT PredictHouseRent(*) FROM HomeRentals LIMIT 10; diff --git a/test/integration_tests/long/test_open.py b/test/integration_tests/long/test_open.py index dac1ee22cb..d82e912a6c 100644 --- a/test/integration_tests/long/test_open.py +++ b/test/integration_tests/long/test_open.py @@ -17,7 +17,7 @@ create_sample_image, file_remove, get_evadb_for_testing, - load_udfs_for_testing, + load_functions_for_testing, shutdown_ray, ) @@ -35,8 +35,8 @@ class OpenTests(unittest.TestCase): def setUp(self): self.evadb = get_evadb_for_testing() self.evadb.catalog().reset() - # Load built-in UDFs. - load_udfs_for_testing(self.evadb, mode="debug") + # Load built-in Functions. + load_functions_for_testing(self.evadb, mode="debug") # Insert image path. 
self.img_path = create_sample_image() diff --git a/test/integration_tests/long/test_optimizer_rules.py b/test/integration_tests/long/test_optimizer_rules.py index 29f7b339d9..27f5189a4b 100644 --- a/test/integration_tests/long/test_optimizer_rules.py +++ b/test/integration_tests/long/test_optimizer_rules.py @@ -17,7 +17,7 @@ from test.util import ( get_evadb_for_testing, get_physical_query_plan, - load_udfs_for_testing, + load_functions_for_testing, shutdown_ray, ) @@ -52,7 +52,7 @@ def setUpClass(cls): ua_detrac = f"{EvaDB_ROOT_DIR}/data/ua_detrac/ua_detrac.mp4" execute_query_fetch_all(cls.evadb, f"LOAD VIDEO '{ua_detrac}' INTO MyVideo;") execute_query_fetch_all(cls.evadb, f"LOAD VIDEO '{ua_detrac}' INTO MyVideo2;") - load_udfs_for_testing(cls.evadb, mode="debug") + load_functions_for_testing(cls.evadb, mode="debug") @classmethod def tearDownClass(cls): @@ -146,7 +146,9 @@ def test_should_pushdown_without_pushdown_join_rule(self): self.assertEqual(result_without_pushdown_join_rule, result_with_rule) self.assertEqual(query_plan, query_plan_without_pushdown_join_rule) - @patch("evadb.catalog.catalog_manager.CatalogManager.get_udf_cost_catalog_entry") + @patch( + "evadb.catalog.catalog_manager.CatalogManager.get_function_cost_catalog_entry" + ) def test_should_reorder_predicates(self, mock): def _check_reorder(cost_func): mock.side_effect = cost_func @@ -179,7 +181,9 @@ def _check_reorder(cost_func): lambda name: MagicMock(cost=5) if name == "DummyObjectDetector" else None ) - @patch("evadb.catalog.catalog_manager.CatalogManager.get_udf_cost_catalog_entry") + @patch( + "evadb.catalog.catalog_manager.CatalogManager.get_function_cost_catalog_entry" + ) def test_should_not_reorder_predicates(self, mock): def _check_no_reorder(cost_func): mock.side_effect = cost_func @@ -213,15 +217,17 @@ def _check_no_reorder(cost_func): else MagicMock(cost=5) ) - # no reordering if default cost is used for one UDF + # no reordering if default cost is used for one Function 
_check_no_reorder( lambda name: MagicMock(cost=5) if name == "DummyObjectDetector" else None ) - # no reordering if default cost is used for both UDF + # no reordering if default cost is used for both Function _check_no_reorder(lambda name: None) - @patch("evadb.catalog.catalog_manager.CatalogManager.get_udf_cost_catalog_entry") + @patch( + "evadb.catalog.catalog_manager.CatalogManager.get_function_cost_catalog_entry" + ) def test_should_reorder_multiple_predicates(self, mock): def side_effect_func(name): if name == "DummyMultiObjectDetector": diff --git a/test/integration_tests/long/test_pytorch.py b/test/integration_tests/long/test_pytorch.py index 96af1605d2..f30b4af9fe 100644 --- a/test/integration_tests/long/test_pytorch.py +++ b/test/integration_tests/long/test_pytorch.py @@ -24,7 +24,7 @@ create_sample_video, file_remove, get_evadb_for_testing, - load_udfs_for_testing, + load_functions_for_testing, shutdown_ray, ) @@ -34,9 +34,12 @@ from evadb.configuration.constants import EvaDB_ROOT_DIR from evadb.executor.executor_utils import ExecutorError +from evadb.functions.function_bootstrap_queries import ( + Asl_function_query, + Mvit_function_query, +) from evadb.models.storage.batch import Batch from evadb.server.command_handler import execute_query_fetch_all -from evadb.udfs.udf_bootstrap_queries import Asl_udf_query, Mvit_udf_query from evadb.utils.generic_utils import try_to_import_cv2 @@ -63,7 +66,7 @@ def setUpClass(cls): ) execute_query_fetch_all(cls.evadb, f"LOAD IMAGE '{meme1}' INTO MemeImages;") execute_query_fetch_all(cls.evadb, f"LOAD IMAGE '{meme2}' INTO MemeImages;") - load_udfs_for_testing(cls.evadb) + load_functions_for_testing(cls.evadb) @classmethod def tearDownClass(cls): @@ -116,14 +119,14 @@ def test_should_apply_parallel_match_sequential(self): @ray_skip_marker def test_should_project_parallel_match_sequential(self): - create_udf_query = """CREATE UDF IF NOT EXISTS FaceDetector + create_function_query = """CREATE FUNCTION IF NOT EXISTS 
FaceDetector INPUT (frame NDARRAY UINT8(3, ANYDIM, ANYDIM)) OUTPUT (bboxes NDARRAY FLOAT32(ANYDIM, 4), scores NDARRAY FLOAT32(ANYDIM)) TYPE FaceDetection - IMPL 'evadb/udfs/face_detector.py'; + IMPL 'evadb/functions/face_detector.py'; """ - execute_query_fetch_all(self.evadb, create_udf_query) + execute_query_fetch_all(self.evadb, create_function_query) select_query = "SELECT FaceDetector(data) FROM MyVideo WHERE id < 5;" # Parallel execution @@ -167,7 +170,7 @@ def test_should_run_pytorch_and_fastrcnn_with_lateral_join(self): @pytest.mark.torchtest def test_should_run_pytorch_and_yolo_and_mvit(self): - execute_query_fetch_all(self.evadb, Mvit_udf_query) + execute_query_fetch_all(self.evadb, Mvit_function_query) select_query = """SELECT FIRST(id), Yolo(FIRST(data)), @@ -187,7 +190,7 @@ def test_should_run_pytorch_and_yolo_and_mvit(self): @pytest.mark.torchtest def test_should_run_pytorch_and_asl(self): - execute_query_fetch_all(self.evadb, Asl_udf_query) + execute_query_fetch_all(self.evadb, Asl_function_query) select_query = """SELECT FIRST(id), ASLActionRecognition(SEGMENT(data)) FROM Asl_actions SAMPLE 5 @@ -202,14 +205,14 @@ def test_should_run_pytorch_and_asl(self): @pytest.mark.torchtest def test_should_run_pytorch_and_facenet(self): - create_udf_query = """CREATE UDF IF NOT EXISTS FaceDetector + create_function_query = """CREATE FUNCTION IF NOT EXISTS FaceDetector INPUT (frame NDARRAY UINT8(3, ANYDIM, ANYDIM)) OUTPUT (bboxes NDARRAY FLOAT32(ANYDIM, 4), scores NDARRAY FLOAT32(ANYDIM)) TYPE FaceDetection - IMPL 'evadb/udfs/face_detector.py'; + IMPL 'evadb/functions/face_detector.py'; """ - execute_query_fetch_all(self.evadb, create_udf_query) + execute_query_fetch_all(self.evadb, create_function_query) select_query = """SELECT FaceDetector(data) FROM MyVideo WHERE id < 5;""" @@ -220,15 +223,15 @@ def test_should_run_pytorch_and_facenet(self): @windows_skip_marker @ocr_skip_marker def test_should_run_pytorch_and_ocr(self): - create_udf_query = """CREATE UDF IF 
NOT EXISTS OCRExtractor + create_function_query = """CREATE FUNCTION IF NOT EXISTS OCRExtractor INPUT (frame NDARRAY UINT8(3, ANYDIM, ANYDIM)) OUTPUT (labels NDARRAY STR(10), bboxes NDARRAY FLOAT32(ANYDIM, 4), scores NDARRAY FLOAT32(ANYDIM)) TYPE OCRExtraction - IMPL 'evadb/udfs/ocr_extractor.py'; + IMPL 'evadb/functions/ocr_extractor.py'; """ - execute_query_fetch_all(self.evadb, create_udf_query) + execute_query_fetch_all(self.evadb, create_function_query) select_query = """SELECT OCRExtractor(data) FROM MNIST WHERE id >= 150 AND id < 155;""" @@ -242,13 +245,13 @@ def test_should_run_pytorch_and_ocr(self): @pytest.mark.torchtest def test_should_run_pytorch_and_resnet50(self): - create_udf_query = """CREATE UDF IF NOT EXISTS FeatureExtractor + create_function_query = """CREATE FUNCTION IF NOT EXISTS FeatureExtractor INPUT (frame NDARRAY UINT8(3, ANYDIM, ANYDIM)) OUTPUT (features NDARRAY FLOAT32(ANYDIM)) TYPE Classification - IMPL 'evadb/udfs/feature_extractor.py'; + IMPL 'evadb/functions/feature_extractor.py'; """ - execute_query_fetch_all(self.evadb, create_udf_query) + execute_query_fetch_all(self.evadb, create_function_query) select_query = """SELECT FeatureExtractor(data) FROM MyVideo WHERE id < 5;""" @@ -262,31 +265,31 @@ def test_should_run_pytorch_and_resnet50(self): @pytest.mark.torchtest def test_should_run_pytorch_and_similarity(self): - create_open_udf_query = """CREATE UDF IF NOT EXISTS Open + create_open_function_query = """CREATE FUNCTION IF NOT EXISTS Open INPUT (img_path TEXT(1000)) OUTPUT (data NDARRAY UINT8(3, ANYDIM, ANYDIM)) - TYPE NdarrayUDF - IMPL "evadb/udfs/ndarray/open.py"; + TYPE NdarrayFUNCTION + IMPL "evadb/functions/ndarray/open.py"; """ - execute_query_fetch_all(self.evadb, create_open_udf_query) + execute_query_fetch_all(self.evadb, create_open_function_query) - create_similarity_udf_query = """CREATE UDF IF NOT EXISTS Similarity + create_similarity_function_query = """CREATE FUNCTION IF NOT EXISTS Similarity INPUT (Frame_Array_Open 
NDARRAY UINT8(3, ANYDIM, ANYDIM), Frame_Array_Base NDARRAY UINT8(3, ANYDIM, ANYDIM), Feature_Extractor_Name TEXT(100)) OUTPUT (distance FLOAT(32, 7)) - TYPE NdarrayUDF - IMPL "evadb/udfs/ndarray/similarity.py"; + TYPE NdarrayFUNCTION + IMPL "evadb/functions/ndarray/similarity.py"; """ - execute_query_fetch_all(self.evadb, create_similarity_udf_query) + execute_query_fetch_all(self.evadb, create_similarity_function_query) - create_feat_udf_query = """CREATE UDF IF NOT EXISTS FeatureExtractor + create_feat_function_query = """CREATE FUNCTION IF NOT EXISTS FeatureExtractor INPUT (frame NDARRAY UINT8(3, ANYDIM, ANYDIM)) OUTPUT (features NDARRAY FLOAT32(ANYDIM)) TYPE Classification - IMPL "evadb/udfs/feature_extractor.py"; + IMPL "evadb/functions/feature_extractor.py"; """ - execute_query_fetch_all(self.evadb, create_feat_udf_query) + execute_query_fetch_all(self.evadb, create_feat_function_query) select_query = """SELECT data FROM MyVideo WHERE id = 1;""" batch_res = execute_query_fetch_all(self.evadb, select_query) @@ -320,15 +323,15 @@ def test_should_run_pytorch_and_similarity(self): @windows_skip_marker @ocr_skip_marker def test_should_run_ocr_on_cropped_data(self): - create_udf_query = """CREATE UDF IF NOT EXISTS OCRExtractor + create_function_query = """CREATE FUNCTION IF NOT EXISTS OCRExtractor INPUT (text NDARRAY STR(100)) OUTPUT (labels NDARRAY STR(10), bboxes NDARRAY FLOAT32(ANYDIM, 4), scores NDARRAY FLOAT32(ANYDIM)) TYPE OCRExtraction - IMPL 'evadb/udfs/ocr_extractor.py'; + IMPL 'evadb/functions/ocr_extractor.py'; """ - execute_query_fetch_all(self.evadb, create_udf_query) + execute_query_fetch_all(self.evadb, create_function_query) select_query = """SELECT OCRExtractor(Crop(data, [2, 2, 24, 24])) FROM MNIST WHERE id >= 150 AND id < 155;""" diff --git a/test/integration_tests/long/test_reuse.py b/test/integration_tests/long/test_reuse.py index a1e60ba354..eefcda67eb 100644 --- a/test/integration_tests/long/test_reuse.py +++ 
b/test/integration_tests/long/test_reuse.py @@ -20,7 +20,7 @@ from test.util import ( get_evadb_for_testing, get_logical_query_plan, - load_udfs_for_testing, + load_functions_for_testing, shutdown_ray, ) @@ -39,20 +39,20 @@ class ReuseTest(unittest.TestCase): def _load_hf_model(self): - udf_name = "HFObjectDetector" - create_udf_query = f"""CREATE UDF {udf_name} + function_name = "HFObjectDetector" + create_function_query = f"""CREATE FUNCTION {function_name} TYPE HuggingFace - 'task' 'object-detection' - 'model' 'facebook/detr-resnet-50'; + TASK 'object-detection' + MODEL 'facebook/detr-resnet-50'; """ - execute_query_fetch_all(self.evadb, create_udf_query) + execute_query_fetch_all(self.evadb, create_function_query) def setUp(self): self.evadb = get_evadb_for_testing() self.evadb.catalog().reset() ua_detrac = f"{EvaDB_ROOT_DIR}/data/ua_detrac/ua_detrac.mp4" execute_query_fetch_all(self.evadb, f"LOAD VIDEO '{ua_detrac}' INTO DETRAC;") - load_udfs_for_testing(self.evadb) + load_functions_for_testing(self.evadb) self._load_hf_model() def tearDown(self): @@ -149,7 +149,7 @@ def test_reuse_logical_project_with_duplicate_query(self): self.assertGreater(exec_times[0], exec_times[1]) @gpu_skip_marker - def test_reuse_with_udf_in_predicate(self): + def test_reuse_with_function_in_predicate(self): select_query = """SELECT id FROM DETRAC WHERE ['car'] <@ HFObjectDetector(data).label AND id < 4""" batches, exec_times = self._reuse_experiment([select_query, select_query]) @@ -158,7 +158,7 @@ def test_reuse_with_udf_in_predicate(self): self.assertGreater(exec_times[0], exec_times[1]) @gpu_skip_marker - def test_reuse_across_different_predicate_using_same_udf(self): + def test_reuse_across_different_predicate_using_same_function(self): query1 = """SELECT id FROM DETRAC WHERE ['car'] <@ HFObjectDetector(data).label AND id < 15""" query2 = """SELECT id FROM DETRAC WHERE ArrayCount(HFObjectDetector(data).label, 'car') > 3 AND id < 12;""" @@ -211,7 +211,7 @@ def 
test_reuse_after_server_shutdown(self): # stop the server os.system("nohup evadb_server --stop") - def test_drop_udf_should_remove_cache(self): + def test_drop_function_should_remove_cache(self): select_query = """SELECT id, label FROM DETRAC JOIN LATERAL Yolo(data) AS Obj(label, bbox, conf) WHERE id < 5;""" execute_query_fetch_all(self.evadb, select_query) @@ -224,15 +224,19 @@ def test_drop_udf_should_remove_cache(self): cache_name = plan.func_expr.signature() # cache exists - udf_cache = self.evadb.catalog().get_udf_cache_catalog_entry_by_name(cache_name) - cache_dir = Path(udf_cache.cache_path) - self.assertIsNotNone(udf_cache) + function_cache = self.evadb.catalog().get_function_cache_catalog_entry_by_name( + cache_name + ) + cache_dir = Path(function_cache.cache_path) + self.assertIsNotNone(function_cache) self.assertTrue(cache_dir.exists()) - # cache should be removed if the UDF is removed - execute_query_fetch_all(self.evadb, "DROP UDF Yolo;") - udf_cache = self.evadb.catalog().get_udf_cache_catalog_entry_by_name(cache_name) - self.assertIsNone(udf_cache) + # cache should be removed if the FUNCTION is removed + execute_query_fetch_all(self.evadb, "DROP FUNCTION Yolo;") + function_cache = self.evadb.catalog().get_function_cache_catalog_entry_by_name( + cache_name + ) + self.assertIsNone(function_cache) self.assertFalse(cache_dir.exists()) def test_drop_table_should_remove_cache(self): @@ -248,13 +252,17 @@ def test_drop_table_should_remove_cache(self): cache_name = plan.func_expr.signature() # cache exists - udf_cache = self.evadb.catalog().get_udf_cache_catalog_entry_by_name(cache_name) - cache_dir = Path(udf_cache.cache_path) - self.assertIsNotNone(udf_cache) + function_cache = self.evadb.catalog().get_function_cache_catalog_entry_by_name( + cache_name + ) + cache_dir = Path(function_cache.cache_path) + self.assertIsNotNone(function_cache) self.assertTrue(cache_dir.exists()) # cache should be removed if the Table is removed 
execute_query_fetch_all(self.evadb, "DROP TABLE DETRAC;") - udf_cache = self.evadb.catalog().get_udf_cache_catalog_entry_by_name(cache_name) - self.assertIsNone(udf_cache) + function_cache = self.evadb.catalog().get_function_cache_catalog_entry_by_name( + cache_name + ) + self.assertIsNone(function_cache) self.assertFalse(cache_dir.exists()) diff --git a/test/integration_tests/long/test_saliency.py b/test/integration_tests/long/test_saliency.py index cfac85b778..92efc265d3 100644 --- a/test/integration_tests/long/test_saliency.py +++ b/test/integration_tests/long/test_saliency.py @@ -40,19 +40,19 @@ def tearDownClass(cls): @unittest.skip("Not supported in current version") def test_saliency(self): Saliency1 = f"{EvaDB_ROOT_DIR}/data/saliency/test1.jpeg" - create_udf_query = f"LOAD IMAGE '{Saliency1}' INTO SALIENCY;" + create_function_query = f"LOAD IMAGE '{Saliency1}' INTO SALIENCY;" execute_query_fetch_all(self.evadb, "DROP TABLE IF EXISTS SALIENCY;") - execute_query_fetch_all(self.evadb, create_udf_query) + execute_query_fetch_all(self.evadb, create_function_query) execute_query_fetch_all( - self.evadb, "DROP UDF IF EXISTS SaliencyFeatureExtractor" + self.evadb, "DROP FUNCTION IF EXISTS SaliencyFeatureExtractor" ) - create_udf_query = f"""CREATE UDF IF NOT EXISTS SaliencyFeatureExtractor - IMPL '{EvaDB_ROOT_DIR}/evadb/udfs/saliency_feature_extractor.py'; + create_function_query = f"""CREATE FUNCTION IF NOT EXISTS SaliencyFeatureExtractor + IMPL '{EvaDB_ROOT_DIR}/evadb/functions/saliency_feature_extractor.py'; """ - execute_query_fetch_all(self.evadb, create_udf_query) + execute_query_fetch_all(self.evadb, create_function_query) select_query_saliency = """SELECT data, SaliencyFeatureExtractor(data) FROM SALIENCY diff --git a/test/integration_tests/long/test_select_executor.py b/test/integration_tests/long/test_select_executor.py index a01f047d0a..a48578980d 100644 --- a/test/integration_tests/long/test_select_executor.py +++ 
b/test/integration_tests/long/test_select_executor.py @@ -19,7 +19,7 @@ create_table, file_remove, get_evadb_for_testing, - load_udfs_for_testing, + load_functions_for_testing, shutdown_ray, ) @@ -47,7 +47,7 @@ def setUpClass(cls): ua_detrac = f"{EvaDB_ROOT_DIR}/data/ua_detrac/ua_detrac.mp4" load_query = f"LOAD VIDEO '{ua_detrac}' INTO DETRAC;" execute_query_fetch_all(cls.evadb, load_query) - load_udfs_for_testing(cls.evadb) + load_functions_for_testing(cls.evadb) cls.table1 = create_table(cls.evadb, "table1", 100, 3) cls.table2 = create_table(cls.evadb, "table2", 500, 3) cls.table3 = create_table(cls.evadb, "table3", 1000, 3) diff --git a/test/integration_tests/long/test_similarity.py b/test/integration_tests/long/test_similarity.py index af95e35388..85746af785 100644 --- a/test/integration_tests/long/test_similarity.py +++ b/test/integration_tests/long/test_similarity.py @@ -18,7 +18,7 @@ from test.util import ( create_sample_image, get_evadb_for_testing, - load_udfs_for_testing, + load_functions_for_testing, shutdown_ray, ) @@ -38,8 +38,8 @@ def setUp(self): self.evadb = get_evadb_for_testing() self.evadb.catalog().reset() - # Prepare needed UDFs and data_col. - load_udfs_for_testing(self.evadb, mode="debug") + # Prepare needed Functions and data_col. + load_functions_for_testing(self.evadb, mode="debug") self.img_path = create_sample_image() # Create base comparison table. @@ -128,7 +128,7 @@ def tearDown(self): def test_similarity_should_work_in_order(self): ############################################### - # Test case runs with UDF on raw input table. # + # Test case runs with Function on raw input table. # ############################################### # Top 1 - assume table contains base data_col. @@ -253,7 +253,7 @@ def test_should_do_vector_index_scan(self): ) ############################################### - # Test case runs with UDF on raw input table. # + # Test case runs with Function on raw input table. 
# ############################################### # Execution without index scan. diff --git a/test/integration_tests/long/test_text_filtering.py b/test/integration_tests/long/test_text_filtering.py index 9376646c50..a03ddf9729 100644 --- a/test/integration_tests/long/test_text_filtering.py +++ b/test/integration_tests/long/test_text_filtering.py @@ -41,7 +41,7 @@ def test_text_filter(self): cursor.create_function( "TextFilterKeyword", True, - f"{EvaDB_ROOT_DIR}/evadb/udfs/text_filter_keyword.py", + f"{EvaDB_ROOT_DIR}/evadb/functions/text_filter_keyword.py", ).df() filtered_data = ( cursor.table("MyPDFs") diff --git a/test/integration_tests/short/test_create_database_executor.py b/test/integration_tests/short/test_create_database_executor.py index 29bcc3d0de..7f5d45a44e 100644 --- a/test/integration_tests/short/test_create_database_executor.py +++ b/test/integration_tests/short/test_create_database_executor.py @@ -15,6 +15,8 @@ import unittest from test.util import get_evadb_for_testing, shutdown_ray +from mock import patch + from evadb.server.command_handler import execute_query_fetch_all @@ -42,8 +44,8 @@ def test_create_database_should_add_the_entry(self): PARAMETERS = {};""".format( params ) - - execute_query_fetch_all(self.evadb, query) + with patch("evadb.executor.create_database_executor.get_database_handler"): + execute_query_fetch_all(self.evadb, query) db_entry = self.evadb.catalog().get_database_catalog_entry("demo_db") self.assertEqual(db_entry.name, "demo_db") diff --git a/test/integration_tests/short/test_drop_executor.py b/test/integration_tests/short/test_drop_executor.py index 3800f6405b..fb5fd4339b 100644 --- a/test/integration_tests/short/test_drop_executor.py +++ b/test/integration_tests/short/test_drop_executor.py @@ -112,48 +112,56 @@ def test_should_drop_table(self): self.evadb, drop_query, do_not_print_exceptions=True ) - def run_create_udf_query(self): - create_udf_query = """CREATE UDF DummyObjectDetector + def 
run_create_function_query(self): + create_function_query = """CREATE FUNCTION DummyObjectDetector INPUT (Frame_Array NDARRAY UINT8(3, 256, 256)) OUTPUT (label NDARRAY STR(10)) TYPE Classification IMPL 'test/util.py';""" - execute_query_fetch_all(self.evadb, create_udf_query) + execute_query_fetch_all(self.evadb, create_function_query) - def test_should_drop_udf(self): - self.run_create_udf_query() - udf_name = "DummyObjectDetector" - udf = self.evadb.catalog().get_udf_catalog_entry_by_name(udf_name) - self.assertTrue(udf is not None) + def test_should_drop_function(self): + self.run_create_function_query() + function_name = "DummyObjectDetector" + function = self.evadb.catalog().get_function_catalog_entry_by_name( + function_name + ) + self.assertTrue(function is not None) - # Test that dropping the UDF reflects in the catalog - drop_query = "DROP UDF IF EXISTS {};".format(udf_name) + # Test that dropping the FUNCTION reflects in the catalog + drop_query = "DROP FUNCTION IF EXISTS {};".format(function_name) execute_query_fetch_all(self.evadb, drop_query) - udf = self.evadb.catalog().get_udf_catalog_entry_by_name(udf_name) - self.assertTrue(udf is None) - - def test_drop_wrong_udf_name(self): - self.run_create_udf_query() - right_udf_name = "DummyObjectDetector" - wrong_udf_name = "FakeDummyObjectDetector" - udf = self.evadb.catalog().get_udf_catalog_entry_by_name(right_udf_name) - self.assertTrue(udf is not None) - - # Test that dropping the wrong UDF: - # - does not affect UDFs in the catalog + function = self.evadb.catalog().get_function_catalog_entry_by_name( + function_name + ) + self.assertTrue(function is None) + + def test_drop_wrong_function_name(self): + self.run_create_function_query() + right_function_name = "DummyObjectDetector" + wrong_function_name = "FakeDummyObjectDetector" + function = self.evadb.catalog().get_function_catalog_entry_by_name( + right_function_name + ) + self.assertTrue(function is not None) + + # Test that dropping the wrong 
FUNCTION: + # - does not affect FUNCTIONs in the catalog # - raises an appropriate exception - drop_query = "DROP UDF {};".format(wrong_udf_name) + drop_query = "DROP FUNCTION {};".format(wrong_function_name) try: execute_query_fetch_all( self.evadb, drop_query, do_not_print_exceptions=True ) except Exception as e: - err_msg = "UDF {} does not exist, therefore cannot be dropped.".format( - wrong_udf_name + err_msg = "Function {} does not exist, therefore cannot be dropped.".format( + wrong_function_name ) self.assertTrue(str(e) == err_msg) - udf = self.evadb.catalog().get_udf_catalog_entry_by_name(right_udf_name) - self.assertTrue(udf is not None) + function = self.evadb.catalog().get_function_catalog_entry_by_name( + right_function_name + ) + self.assertTrue(function is not None) #### DROP INDEX @@ -167,8 +175,8 @@ def test_should_drop_index(self): # Test that dropping the wrong Index: # - does not affect Indexes in the catalog # - raises an appropriate exception - wrong_udf_name = "wrong_udf_name" - drop_query = f"DROP INDEX {wrong_udf_name};" + wrong_function_name = "wrong_function_name" + drop_query = f"DROP INDEX {wrong_function_name};" with self.assertRaises(ExecutorError): execute_query_fetch_all( self.evadb, drop_query, do_not_print_exceptions=True diff --git a/test/integration_tests/short/test_generic_utils.py b/test/integration_tests/short/test_generic_utils.py index b3bd75db16..bc78af0d8a 100644 --- a/test/integration_tests/short/test_generic_utils.py +++ b/test/integration_tests/short/test_generic_utils.py @@ -21,7 +21,7 @@ from evadb.utils.generic_utils import ( generate_file_path, is_gpu_available, - load_udf_class_from_file, + load_function_class_from_file, str_to_class, validate_kwargs, ) @@ -37,31 +37,33 @@ def test_should_return_correct_class_for_string(self): self.assertEqual(vl, DecordReader) def test_should_return_correct_class_for_path(self): - vl = load_udf_class_from_file("evadb/readers/decord_reader.py", "DecordReader") + vl = 
load_function_class_from_file( + "evadb/readers/decord_reader.py", "DecordReader" + ) # Can't check that v1 = DecordReader because the above function returns decord_reader.DecordReader instead of evadb.readers.decord_reader.DecordReader # So we check the qualname instead, qualname is the path to the class including the module name # Ref: https://peps.python.org/pep-3155/#rationale assert vl.__qualname__ == DecordReader.__qualname__ def test_should_return_correct_class_for_path_without_classname(self): - vl = load_udf_class_from_file("evadb/readers/decord_reader.py") + vl = load_function_class_from_file("evadb/readers/decord_reader.py") assert vl.__qualname__ == DecordReader.__qualname__ def test_should_raise_on_missing_file(self): with self.assertRaises(RuntimeError): - load_udf_class_from_file("evadb/readers/opencv_reader_abdfdsfds.py") + load_function_class_from_file("evadb/readers/opencv_reader_abdfdsfds.py") def test_should_raise_if_class_does_not_exists(self): with self.assertRaises(RuntimeError): # evadb/utils/s3_utils.py has no class in it # if this test fails due to change in s3_utils.py, change the file to something else - load_udf_class_from_file("evadb/utils/s3_utils.py") + load_function_class_from_file("evadb/utils/s3_utils.py") def test_should_raise_if_multiple_classes_exist_and_no_class_mentioned(self): with self.assertRaises(RuntimeError): # evadb/utils/generic_utils.py has multiple classes in it # if this test fails due to change in generic_utils.py, change the file to something else - load_udf_class_from_file("evadb/utils/generic_utils.py") + load_function_class_from_file("evadb/utils/generic_utils.py") def test_should_use_torch_to_check_if_gpu_is_available(self): # Emulate a missing import diff --git a/test/integration_tests/short/test_select_executor.py b/test/integration_tests/short/test_select_executor.py index 10c0d5c030..3b97ea9546 100644 --- a/test/integration_tests/short/test_select_executor.py +++ 
b/test/integration_tests/short/test_select_executor.py @@ -21,7 +21,7 @@ file_remove, get_evadb_for_testing, get_logical_query_plan, - load_udfs_for_testing, + load_functions_for_testing, shutdown_ray, ) @@ -47,7 +47,7 @@ def setUpClass(cls): load_query = f"LOAD VIDEO '{video_file_path}' INTO MyVideo;" execute_query_fetch_all(cls.evadb, load_query) - load_udfs_for_testing(cls.evadb) + load_functions_for_testing(cls.evadb) cls.table1 = create_table(cls.evadb, "table1", 100, 3) cls.table2 = create_table(cls.evadb, "table2", 500, 3) @@ -315,14 +315,14 @@ def test_lateral_join_with_unnest_and_sample(self): self.assertEqual(unnest_batch, expected) def test_should_raise_error_with_missing_alias_in_lateral_join(self): - udf_name = "DummyMultiObjectDetector" + function_name = "DummyMultiObjectDetector" query = """SELECT id, labels FROM MyVideo JOIN LATERAL DummyMultiObjectDetector(data).labels;""" with self.assertRaises(SyntaxError) as cm: execute_query_fetch_all(self.evadb, query, do_not_print_exceptions=True) self.assertEqual( str(cm.exception), - f"TableValuedFunction {udf_name} should have alias.", + f"TableValuedFunction {function_name} should have alias.", ) query = """SELECT id, labels @@ -332,7 +332,7 @@ def test_should_raise_error_with_missing_alias_in_lateral_join(self): execute_query_fetch_all(self.evadb, query) self.assertEqual( str(cm.exception), - f"TableValuedFunction {udf_name} should have alias.", + f"TableValuedFunction {function_name} should have alias.", ) query = """SELECT id, labels @@ -341,11 +341,11 @@ def test_should_raise_error_with_missing_alias_in_lateral_join(self): execute_query_fetch_all(self.evadb, query) self.assertEqual( str(cm.exception), - f"TableValuedFunction {udf_name} should have alias.", + f"TableValuedFunction {function_name} should have alias.", ) def test_should_raise_error_with_invalid_number_of_aliases(self): - udf_name = "DummyMultiObjectDetector" + function_name = "DummyMultiObjectDetector" query = """SELECT id, labels FROM 
MyVideo JOIN LATERAL DummyMultiObjectDetector(data).bboxes AS T;""" @@ -353,7 +353,7 @@ def test_should_raise_error_with_invalid_number_of_aliases(self): execute_query_fetch_all(self.evadb, query) self.assertEqual( str(cm.exception), - f"Output bboxes does not exist for {udf_name}.", + f"Output bboxes does not exist for {function_name}.", ) def test_should_raise_error_with_invalid_output_lateral_join(self): @@ -407,9 +407,9 @@ def test_expression_tree_signature(self): self.evadb, "SELECT DummyMultiObjectDetector(data).labels FROM MyVideo" ) signature = plan.target_list[0].signature() - udf_id = ( + function_id = ( self.evadb.catalog() - .get_udf_catalog_entry_by_name("DummyMultiObjectDetector") + .get_function_catalog_entry_by_name("DummyMultiObjectDetector") .row_id ) table_entry = self.evadb.catalog().get_table_catalog_entry("MyVideo") @@ -417,5 +417,6 @@ def test_expression_tree_signature(self): self.evadb.catalog().get_column_catalog_entry(table_entry, "data").row_id ) self.assertEqual( - signature, f"DummyMultiObjectDetector[{udf_id}](MyVideo.data[{col_id}])" + signature, + f"DummyMultiObjectDetector[{function_id}](MyVideo.data[{col_id}])", ) diff --git a/test/integration_tests/short/test_show_info_executor.py b/test/integration_tests/short/test_show_info_executor.py index 7ec3efa963..da02a98ad7 100644 --- a/test/integration_tests/short/test_show_info_executor.py +++ b/test/integration_tests/short/test_show_info_executor.py @@ -21,9 +21,12 @@ import pytest from evadb.configuration.constants import EvaDB_ROOT_DIR +from evadb.functions.function_bootstrap_queries import ( + ArrayCount_function_query, + Fastrcnn_function_query, +) from evadb.models.storage.batch import Batch from evadb.server.command_handler import execute_query_fetch_all -from evadb.udfs.udf_bootstrap_queries import ArrayCount_udf_query, Fastrcnn_udf_query NUM_FRAMES = 10 @@ -34,7 +37,7 @@ class ShowExecutorTest(unittest.TestCase): def setUpClass(cls): cls.evadb = get_evadb_for_testing() 
cls.evadb.catalog().reset() - queries = [Fastrcnn_udf_query, ArrayCount_udf_query] + queries = [Fastrcnn_function_query, ArrayCount_function_query] for query in queries: execute_query_fetch_all(cls.evadb, query) @@ -52,13 +55,13 @@ def tearDownClass(cls): execute_query_fetch_all(cls.evadb, "DROP TABLE IF EXISTS MyVideo;") # integration test - def test_show_udfs(self): - result = execute_query_fetch_all(self.evadb, "SHOW UDFS;") + def test_show_functions(self): + result = execute_query_fetch_all(self.evadb, "SHOW FUNCTIONS;") self.assertEqual(len(result.columns), 6) expected = { "name": ["FastRCNNObjectDetector", "ArrayCount"], - "type": ["Classification", "NdarrayUDF"], + "type": ["Classification", "NdarrayFunction"], } expected_df = pd.DataFrame(expected) self.assertTrue(all(expected_df.name == result.frames.name)) diff --git a/test/integration_tests/short/test_use_executor.py b/test/integration_tests/short/test_use_executor.py new file mode 100644 index 0000000000..6f941e2488 --- /dev/null +++ b/test/integration_tests/short/test_use_executor.py @@ -0,0 +1,43 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import unittest +from test.util import get_evadb_for_testing, shutdown_ray + +from evadb.executor.executor_utils import ExecutorError +from evadb.server.command_handler import execute_query_fetch_all + + +class CreateDatabaseTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.evadb = get_evadb_for_testing() + # reset the catalog manager before running each test + cls.evadb.catalog().reset() + + @classmethod + def tearDownClass(cls): + shutdown_ray() + + def test_use_should_raise_executor_error(self): + query = """USE not_available_ds { + SELECT * FROM table + }""" + + with self.assertRaises(ExecutorError): + execute_query_fetch_all(self.evadb, query) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/markers.py b/test/markers.py index 4e1aa839d5..ae5b6acebb 100644 --- a/test/markers.py +++ b/test/markers.py @@ -19,6 +19,7 @@ import pytest from evadb.utils.generic_utils import ( + is_forecast_available, is_gpu_available, is_ludwig_available, is_qdrant_available, @@ -73,3 +74,8 @@ chatgpt_skip_marker = pytest.mark.skip( reason="requires chatgpt", ) + +forecast_skip_marker = pytest.mark.skipif( + is_forecast_available() is False, + reason="Run only if forecasting packages available", +) diff --git a/test/third_party_tests/test_native_executor.py b/test/third_party_tests/test_native_executor.py index 8ac803fd57..7259f4ef03 100644 --- a/test/third_party_tests/test_native_executor.py +++ b/test/third_party_tests/test_native_executor.py @@ -17,6 +17,7 @@ import pytest +from evadb.executor.executor_utils import ExecutorError from evadb.server.command_handler import execute_query_fetch_all @@ -63,6 +64,25 @@ def _drop_table_in_native_database(self): }""", ) + def _create_evadb_table_using_select_query(self): + execute_query_fetch_all( + self.evadb, + """CREATE TABLE eva_table AS SELECT name, age FROM test_data_source.test_table;""", + ) + + # check if the create table is successful + res_batch = execute_query_fetch_all(self.evadb, "Select 
* from eva_table") + self.assertEqual(len(res_batch), 2) + self.assertEqual(res_batch.frames["eva_table.name"][0], "aa") + self.assertEqual(res_batch.frames["eva_table.age"][0], 1) + self.assertEqual(res_batch.frames["eva_table.name"][1], "bb") + self.assertEqual(res_batch.frames["eva_table.age"][1], 2) + + execute_query_fetch_all( + self.evadb, + "DROP TABLE IF EXISTS eva_table;", + ) + def _execute_evadb_query(self): self._create_table_in_native_database() self._insert_value_into_native_database("aa", 1, "aaaa") @@ -78,6 +98,7 @@ def _execute_evadb_query(self): self.assertEqual(res_batch.frames["test_table.name"][1], "bb") self.assertEqual(res_batch.frames["test_table.age"][1], 2) + self._create_evadb_table_using_select_query() self._drop_table_in_native_database() def _execute_native_query(self): @@ -91,12 +112,41 @@ def _execute_native_query(self): }""", ) self.assertEqual(len(res_batch), 1) + self.assertEqual(res_batch.frames["name"][0], "aa") self.assertEqual(res_batch.frames["age"][0], 1) self.assertEqual(res_batch.frames["comment"][0], "aaaa") self._drop_table_in_native_database() + def _raise_error_on_multiple_creation(self): + params = { + "user": "eva", + "password": "password", + "host": "localhost", + "port": "5432", + "database": "evadb", + } + query = f"""CREATE DATABASE test_data_source + WITH ENGINE = "postgres", + PARAMETERS = {params};""" + with self.assertRaises(ExecutorError): + execute_query_fetch_all(self.evadb, query) + + def _raise_error_on_invalid_connection(self): + params = { + "user": "xxxxxx", + "password": "xxxxxx", + "host": "localhost", + "port": "5432", + "database": "evadb", + } + query = f"""CREATE DATABASE invaid + WITH ENGINE = "postgres", + PARAMETERS = {params};""" + with self.assertRaises(ExecutorError): + execute_query_fetch_all(self.evadb, query) + def test_should_run_query_in_postgres(self): # Create database. params = { @@ -114,3 +164,43 @@ def test_should_run_query_in_postgres(self): # Test executions. 
self._execute_native_query() self._execute_evadb_query() + + # Test error. + self._raise_error_on_multiple_creation() + self._raise_error_on_invalid_connection() + + def test_should_run_query_in_sqlite(self): + # Create database. + params = { + "database": "evadb.db", + } + query = f"""CREATE DATABASE test_data_source + WITH ENGINE = "sqlite", + PARAMETERS = {params};""" + execute_query_fetch_all(self.evadb, query) + + # Test executions. + self._execute_native_query() + self._execute_evadb_query() + + def test_should_run_query_in_mysql(self): + # Create database. + params = { + "user": "eva", + "password": "password", + "host": "localhost", + "port": "3306", + "database": "evadb", + } + query = f"""CREATE DATABASE test_data_source + WITH ENGINE = "mysql", + PARAMETERS = {params};""" + execute_query_fetch_all(self.evadb, query) + + # Test executions. + self._execute_native_query() + self._execute_evadb_query() + + +if __name__ == "__main__": + unittest.main() diff --git a/test/unit_tests/binder/test_statement_binder.py b/test/unit_tests/binder/test_statement_binder.py index 5a70beed2c..f449596cf0 100644 --- a/test/unit_tests/binder/test_statement_binder.py +++ b/test/unit_tests/binder/test_statement_binder.py @@ -152,7 +152,7 @@ def test_bind_func_expr_with_star(self): binderContext = MagicMock() tvp1 = ("T", "col1") tvp2 = ("T", "col2") - binderContext._catalog.return_value.get_udf_catalog_entry_by_name.return_value = ( + binderContext._catalog.return_value.get_function_catalog_entry_by_name.return_value = ( None ) binderContext._get_all_alias_and_col_name.return_value = [tvp1, tvp2] @@ -169,8 +169,8 @@ def test_bind_func_expr_with_star(self): call2.args[0], TupleValueExpression(name=tvp2[1], table_alias=tvp2[0]) ) - @patch("evadb.binder.statement_binder.load_udf_class_from_file") - def test_bind_func_expr(self, mock_load_udf_class_from_file): + @patch("evadb.binder.statement_binder.load_function_class_from_file") + def test_bind_func_expr(self, 
mock_load_function_class_from_file): # setup func_expr = MagicMock( name="func_expr", alias=Alias("func_expr"), output_col_aliases=[] @@ -181,38 +181,38 @@ def test_bind_func_expr(self, mock_load_udf_class_from_file): obj2 = MagicMock() obj2.name.lower.return_value = "out2" func_output_objs = [obj1, obj2] - udf_obj = MagicMock() + function_obj = MagicMock() mock_catalog = MagicMock() - mock_get_name = mock_catalog().get_udf_catalog_entry_by_name = MagicMock() - mock_get_name.return_value = udf_obj + mock_get_name = mock_catalog().get_function_catalog_entry_by_name = MagicMock() + mock_get_name.return_value = function_obj - mock_get_udf_outputs = ( - mock_catalog().get_udf_io_catalog_output_entries + mock_get_function_outputs = ( + mock_catalog().get_function_io_catalog_output_entries ) = MagicMock() - mock_get_udf_outputs.return_value = func_output_objs - mock_load_udf_class_from_file.return_value.return_value = ( - "load_udf_class_from_file" + mock_get_function_outputs.return_value = func_output_objs + mock_load_function_class_from_file.return_value.return_value = ( + "load_function_class_from_file" ) - # mock_get_file_checksum.return_value = udf_obj.checksum + # mock_get_file_checksum.return_value = function_obj.checksum # Case 1 set output func_expr.output = "out1" binder = StatementBinder(StatementBinderContext(mock_catalog)) binder._bind_func_expr(func_expr) - # mock_get_file_checksum.assert_called_with(udf_obj.impl_file_path) + # mock_get_file_checksum.assert_called_with(function_obj.impl_file_path) mock_get_name.assert_called_with(func_expr.name) - mock_get_udf_outputs.assert_called_with(udf_obj) - mock_load_udf_class_from_file.assert_called_with( - udf_obj.impl_file_path, udf_obj.name + mock_get_function_outputs.assert_called_with(function_obj) + mock_load_function_class_from_file.assert_called_with( + function_obj.impl_file_path, function_obj.name ) self.assertEqual(func_expr.output_objs, [obj1]) self.assertEqual( func_expr.alias, Alias("func_expr", 
["out1"]), ) - self.assertEqual(func_expr.function(), "load_udf_class_from_file") + self.assertEqual(func_expr.function(), "load_function_class_from_file") # Case 2 output not set func_expr.output = None @@ -220,11 +220,11 @@ def test_bind_func_expr(self, mock_load_udf_class_from_file): binder = StatementBinder(StatementBinderContext(mock_catalog)) binder._bind_func_expr(func_expr) - # mock_get_file_checksum.assert_called_with(udf_obj.impl_file_path) + # mock_get_file_checksum.assert_called_with(function_obj.impl_file_path) mock_get_name.assert_called_with(func_expr.name) - mock_get_udf_outputs.assert_called_with(udf_obj) - mock_load_udf_class_from_file.assert_called_with( - udf_obj.impl_file_path, udf_obj.name + mock_get_function_outputs.assert_called_with(function_obj) + mock_load_function_class_from_file.assert_called_with( + function_obj.impl_file_path, function_obj.name ) self.assertEqual(func_expr.output_objs, func_output_objs) self.assertEqual( @@ -234,20 +234,20 @@ def test_bind_func_expr(self, mock_load_udf_class_from_file): ["out1", "out2"], ), ) - self.assertEqual(func_expr.function(), "load_udf_class_from_file") + self.assertEqual(func_expr.function(), "load_function_class_from_file") # Raise error if the class object cannot be created - mock_load_udf_class_from_file.reset_mock() - mock_error_msg = "mock_load_udf_class_from_file_error" - mock_load_udf_class_from_file.side_effect = MagicMock( + mock_load_function_class_from_file.reset_mock() + mock_error_msg = "mock_load_function_class_from_file_error" + mock_load_function_class_from_file.side_effect = MagicMock( side_effect=RuntimeError(mock_error_msg) ) binder = StatementBinder(StatementBinderContext(mock_catalog)) with self.assertRaises(BinderError) as cm: binder._bind_func_expr(func_expr) err_msg = ( - f"{mock_error_msg}. Please verify that the UDF class name in the " - "implementation file matches the UDF name." + f"{mock_error_msg}. 
Please verify that the function class name in the " + "implementation file matches the function name." ) self.assertEqual(str(cm.exception), err_msg) @@ -308,12 +308,14 @@ def test_bind_create_index(self): binder._bind_create_index_statement(create_index_statement) create_index_statement.col_list = ["foo"] - udf_obj = MagicMock() + function_obj = MagicMock() output = MagicMock() - udf_obj.outputs = [output] + function_obj.outputs = [output] with patch.object( - catalog(), "get_udf_catalog_entry_by_name", return_value=udf_obj + catalog(), + "get_function_catalog_entry_by_name", + return_value=function_obj, ): with self.assertRaises(AssertionError): binder._bind_create_index_statement(create_index_statement) @@ -323,7 +325,7 @@ def test_bind_create_index(self): output.array_dimensions = [1, 100] binder._bind_create_index_statement(create_index_statement) - create_index_statement.udf_func = None + create_index_statement.function = None col_def = MagicMock() col_def.name = "a" create_index_statement.col_list = [col_def] @@ -339,18 +341,18 @@ def test_bind_create_index(self): col.array_dimensions = [1, 10] binder._bind_create_index_statement(create_index_statement) - def test_bind_create_udf_should_raise(self): + def test_bind_create_function_should_raise(self): with patch.object(StatementBinder, "bind"): - create_udf_statement = MagicMock() - create_udf_statement.query.target_list = [] - create_udf_statement.metadata = [] + create_function_statement = MagicMock() + create_function_statement.query.target_list = [] + create_function_statement.metadata = [] binder = StatementBinder(StatementBinderContext(MagicMock())) with self.assertRaises(AssertionError): - binder._bind_create_udf_statement(create_udf_statement) + binder._bind_create_function_statement(create_function_statement) - def test_bind_create_udf_should_drop_row_id(self): + def test_bind_create_function_should_drop_row_id(self): with patch.object(StatementBinder, "bind"): - create_udf_statement = MagicMock() + 
create_function_statement = MagicMock() row_id_col_obj = ColumnCatalogEntry( name=IDENTIFIER_COLUMN, type=MagicMock(), @@ -369,7 +371,7 @@ def test_bind_create_udf_should_drop_row_id(self): array_type=MagicMock(), array_dimensions=MagicMock(), ) - create_udf_statement.query.target_list = [ + create_function_statement.query.target_list = [ TupleValueExpression( name=IDENTIFIER_COLUMN, table_alias="a", col_object=row_id_col_obj ), @@ -380,12 +382,12 @@ def test_bind_create_udf_should_drop_row_id(self): name="predict_column", table_alias="a", col_object=output_col_obj ), ] - create_udf_statement.metadata = [("predict", "predict_column")] + create_function_statement.metadata = [("predict", "predict_column")] binder = StatementBinder(StatementBinderContext(MagicMock())) - binder._bind_create_udf_statement(create_udf_statement) + binder._bind_create_function_statement(create_function_statement) self.assertEqual( - create_udf_statement.query.target_list, + create_function_statement.query.target_list, [ TupleValueExpression( name="input_column", table_alias="a", col_object=input_col_obj @@ -414,5 +416,5 @@ def test_bind_create_udf_should_drop_row_id(self): output_col_obj.array_dimensions, ) ] - self.assertEqual(create_udf_statement.inputs, expected_inputs) - self.assertEqual(create_udf_statement.outputs, expected_outputs) + self.assertEqual(create_function_statement.inputs, expected_inputs) + self.assertEqual(create_function_statement.outputs, expected_outputs) diff --git a/test/unit_tests/catalog/models/test_models.py b/test/unit_tests/catalog/models/test_models.py index 02e865e929..524505db5f 100644 --- a/test/unit_tests/catalog/models/test_models.py +++ b/test/unit_tests/catalog/models/test_models.py @@ -16,10 +16,10 @@ from evadb.catalog.catalog_type import ColumnType, NdArrayType, TableType from evadb.catalog.models.column_catalog import ColumnCatalogEntry +from evadb.catalog.models.function_catalog import FunctionCatalogEntry +from 
evadb.catalog.models.function_io_catalog import FunctionIOCatalogEntry from evadb.catalog.models.index_catalog import IndexCatalogEntry from evadb.catalog.models.table_catalog import TableCatalogEntry -from evadb.catalog.models.udf_catalog import UdfCatalogEntry -from evadb.catalog.models.udf_io_catalog import UdfIOCatalogEntry class CatalogModelsTest(unittest.TestCase): @@ -77,72 +77,86 @@ def test_table_catalog_entry_equality(self): self.assertNotEqual(table_catalog_entry, table_catalog_entry1) - def test_udf(self): - udf = UdfCatalogEntry("udf", "fasterRCNN", "ObjectDetection", "checksum") - self.assertEqual(udf.row_id, None) - self.assertEqual(udf.impl_file_path, "fasterRCNN") - self.assertEqual(udf.name, "udf") - self.assertEqual(udf.type, "ObjectDetection") - self.assertEqual(udf.checksum, "checksum") - - def test_udf_hash(self): - udf1 = UdfCatalogEntry("udf", "fasterRCNN", "ObjectDetection", "checksum") - udf2 = UdfCatalogEntry("udf", "fasterRCNN", "ObjectDetection", "checksum") - - self.assertEqual(hash(udf1), hash(udf2)) - - def test_udf_equality(self): - udf = UdfCatalogEntry("udf", "fasterRCNN", "ObjectDetection", "checksum") - self.assertEqual(udf, udf) - udf2 = UdfCatalogEntry("udf2", "fasterRCNN", "ObjectDetection", "checksum") - self.assertNotEqual(udf, udf2) - udf3 = UdfCatalogEntry("udf", "fasterRCNN2", "ObjectDetection", "checksum") - self.assertNotEqual(udf, udf3) - udf4 = UdfCatalogEntry("udf2", "fasterRCNN", "ObjectDetection3", "checksum") - self.assertNotEqual(udf, udf4) - - def test_udf_io(self): - udf_io = UdfIOCatalogEntry( + def test_function(self): + function = FunctionCatalogEntry( + "function", "fasterRCNN", "ObjectDetection", "checksum" + ) + self.assertEqual(function.row_id, None) + self.assertEqual(function.impl_file_path, "fasterRCNN") + self.assertEqual(function.name, "function") + self.assertEqual(function.type, "ObjectDetection") + self.assertEqual(function.checksum, "checksum") + + def test_function_hash(self): + function1 = 
FunctionCatalogEntry( + "function", "fasterRCNN", "ObjectDetection", "checksum" + ) + function2 = FunctionCatalogEntry( + "function", "fasterRCNN", "ObjectDetection", "checksum" + ) + + self.assertEqual(hash(function1), hash(function2)) + + def test_function_equality(self): + function = FunctionCatalogEntry( + "function", "fasterRCNN", "ObjectDetection", "checksum" + ) + self.assertEqual(function, function) + function2 = FunctionCatalogEntry( + "function2", "fasterRCNN", "ObjectDetection", "checksum" + ) + self.assertNotEqual(function, function2) + function3 = FunctionCatalogEntry( + "function", "fasterRCNN2", "ObjectDetection", "checksum" + ) + self.assertNotEqual(function, function3) + function4 = FunctionCatalogEntry( + "function2", "fasterRCNN", "ObjectDetection3", "checksum" + ) + self.assertNotEqual(function, function4) + + def test_function_io(self): + function_io = FunctionIOCatalogEntry( "name", ColumnType.NDARRAY, True, NdArrayType.UINT8, [2, 3], True, 1 ) - self.assertEqual(udf_io.row_id, None) - self.assertEqual(udf_io.udf_id, 1) - self.assertEqual(udf_io.is_input, True) - self.assertEqual(udf_io.is_nullable, True) - self.assertEqual(udf_io.array_type, NdArrayType.UINT8) - self.assertEqual(udf_io.array_dimensions, [2, 3]) - self.assertEqual(udf_io.name, "name") - self.assertEqual(udf_io.type, ColumnType.NDARRAY) - - def test_udf_io_equality(self): - udf_io = UdfIOCatalogEntry( + self.assertEqual(function_io.row_id, None) + self.assertEqual(function_io.function_id, 1) + self.assertEqual(function_io.is_input, True) + self.assertEqual(function_io.is_nullable, True) + self.assertEqual(function_io.array_type, NdArrayType.UINT8) + self.assertEqual(function_io.array_dimensions, [2, 3]) + self.assertEqual(function_io.name, "name") + self.assertEqual(function_io.type, ColumnType.NDARRAY) + + def test_function_io_equality(self): + function_io = FunctionIOCatalogEntry( "name", ColumnType.FLOAT, True, None, [2, 3], True, 1 ) - self.assertEqual(udf_io, udf_io) - 
udf_io2 = UdfIOCatalogEntry( + self.assertEqual(function_io, function_io) + function_io2 = FunctionIOCatalogEntry( "name2", ColumnType.FLOAT, True, None, [2, 3], True, 1 ) - self.assertNotEqual(udf_io, udf_io2) - udf_io2 = UdfIOCatalogEntry( + self.assertNotEqual(function_io, function_io2) + function_io2 = FunctionIOCatalogEntry( "name", ColumnType.INTEGER, True, None, [2, 3], True, 1 ) - self.assertNotEqual(udf_io, udf_io2) - udf_io2 = UdfIOCatalogEntry( + self.assertNotEqual(function_io, function_io2) + function_io2 = FunctionIOCatalogEntry( "name", ColumnType.FLOAT, False, None, [2, 3], True, 1 ) - self.assertNotEqual(udf_io, udf_io2) - udf_io2 = UdfIOCatalogEntry( + self.assertNotEqual(function_io, function_io2) + function_io2 = FunctionIOCatalogEntry( "name", ColumnType.FLOAT, True, None, [2, 3, 4], True, 1 ) - self.assertNotEqual(udf_io, udf_io2) - udf_io2 = UdfIOCatalogEntry( + self.assertNotEqual(function_io, function_io2) + function_io2 = FunctionIOCatalogEntry( "name", ColumnType.FLOAT, True, None, [2, 3], False, 1 ) - self.assertNotEqual(udf_io, udf_io2) - udf_io2 = UdfIOCatalogEntry( + self.assertNotEqual(function_io, function_io2) + function_io2 = FunctionIOCatalogEntry( "name", ColumnType.FLOAT, True, None, [2, 3], True, 2 ) - self.assertNotEqual(udf_io, udf_io2) + self.assertNotEqual(function_io, function_io2) def test_index(self): index = IndexCatalogEntry("index", "FaissSavePath", "HNSW") diff --git a/test/unit_tests/catalog/test_catalog_manager.py b/test/unit_tests/catalog/test_catalog_manager.py index 65ea406b76..3c0e23a3cd 100644 --- a/test/unit_tests/catalog/test_catalog_manager.py +++ b/test/unit_tests/catalog/test_catalog_manager.py @@ -22,7 +22,7 @@ from evadb.catalog.catalog_type import ColumnType, TableType from evadb.catalog.catalog_utils import get_video_table_column_definitions from evadb.catalog.models.column_catalog import ColumnCatalogEntry -from evadb.catalog.models.udf_catalog import UdfCatalogEntry +from 
evadb.catalog.models.function_catalog import FunctionCatalogEntry from evadb.parser.table_ref import TableInfo from evadb.parser.types import FileFormatType @@ -125,55 +125,63 @@ def test_get_table_catalog_entry_when_table_doesnot_exists( dcs_mock.return_value.filter_entries_by_table_id.assert_not_called() self.assertEqual(actual, table_obj) - @mock.patch("evadb.catalog.catalog_manager.UdfCatalogService") - @mock.patch("evadb.catalog.catalog_manager.UdfIOCatalogService") - @mock.patch("evadb.catalog.catalog_manager.UdfMetadataCatalogService") + @mock.patch("evadb.catalog.catalog_manager.FunctionCatalogService") + @mock.patch("evadb.catalog.catalog_manager.FunctionIOCatalogService") + @mock.patch("evadb.catalog.catalog_manager.FunctionMetadataCatalogService") @mock.patch("evadb.catalog.catalog_manager.get_file_checksum") - def test_insert_udf(self, checksum_mock, udfmetadata_mock, udfio_mock, udf_mock): + def test_insert_function( + self, checksum_mock, functionmetadata_mock, functionio_mock, function_mock + ): catalog = CatalogManager(MagicMock(), MagicMock()) - udf_io_list = [MagicMock()] - udf_metadata_list = [MagicMock()] - actual = catalog.insert_udf_catalog_entry( - "udf", "sample.py", "classification", udf_io_list, udf_metadata_list + function_io_list = [MagicMock()] + function_metadata_list = [MagicMock()] + actual = catalog.insert_function_catalog_entry( + "function", + "sample.py", + "classification", + function_io_list, + function_metadata_list, ) - udfio_mock.return_value.insert_entries.assert_called_with(udf_io_list) - udfmetadata_mock.return_value.insert_entries.assert_called_with( - udf_metadata_list + functionio_mock.return_value.insert_entries.assert_called_with(function_io_list) + functionmetadata_mock.return_value.insert_entries.assert_called_with( + function_metadata_list ) - udf_mock.return_value.insert_entry.assert_called_with( - "udf", "sample.py", "classification", checksum_mock.return_value + 
function_mock.return_value.insert_entry.assert_called_with( + "function", "sample.py", "classification", checksum_mock.return_value ) checksum_mock.assert_called_with("sample.py") - self.assertEqual(actual, udf_mock.return_value.insert_entry.return_value) + self.assertEqual(actual, function_mock.return_value.insert_entry.return_value) - @mock.patch("evadb.catalog.catalog_manager.UdfCatalogService") - def test_get_udf_catalog_entry_by_name(self, udf_mock): + @mock.patch("evadb.catalog.catalog_manager.FunctionCatalogService") + def test_get_function_catalog_entry_by_name(self, function_mock): catalog = CatalogManager(MagicMock(), MagicMock()) - actual = catalog.get_udf_catalog_entry_by_name("name") - udf_mock.return_value.get_entry_by_name.assert_called_with("name") - self.assertEqual(actual, udf_mock.return_value.get_entry_by_name.return_value) + actual = catalog.get_function_catalog_entry_by_name("name") + function_mock.return_value.get_entry_by_name.assert_called_with("name") + self.assertEqual( + actual, function_mock.return_value.get_entry_by_name.return_value + ) - @mock.patch("evadb.catalog.catalog_manager.UdfCatalogService") - def test_delete_udf(self, udf_mock): - CatalogManager(MagicMock(), MagicMock()).delete_udf_catalog_entry_by_name( + @mock.patch("evadb.catalog.catalog_manager.FunctionCatalogService") + def test_delete_function(self, function_mock): + CatalogManager(MagicMock(), MagicMock()).delete_function_catalog_entry_by_name( "name" ) - udf_mock.return_value.delete_entry_by_name.assert_called_with("name") - - @mock.patch("evadb.catalog.catalog_manager.UdfIOCatalogService") - def test_get_udf_outputs(self, udf_mock): - mock_func = udf_mock.return_value.get_output_entries_by_udf_id - udf_obj = MagicMock(spec=UdfCatalogEntry) - CatalogManager(MagicMock(), MagicMock()).get_udf_io_catalog_output_entries( - udf_obj + function_mock.return_value.delete_entry_by_name.assert_called_with("name") + + 
@mock.patch("evadb.catalog.catalog_manager.FunctionIOCatalogService") + def test_get_function_outputs(self, function_mock): + mock_func = function_mock.return_value.get_output_entries_by_function_id + function_obj = MagicMock(spec=FunctionCatalogEntry) + CatalogManager(MagicMock(), MagicMock()).get_function_io_catalog_output_entries( + function_obj ) - mock_func.assert_called_once_with(udf_obj.row_id) - - @mock.patch("evadb.catalog.catalog_manager.UdfIOCatalogService") - def test_get_udf_inputs(self, udf_mock): - mock_func = udf_mock.return_value.get_input_entries_by_udf_id - udf_obj = MagicMock(spec=UdfCatalogEntry) - CatalogManager(MagicMock(), MagicMock()).get_udf_io_catalog_input_entries( - udf_obj + mock_func.assert_called_once_with(function_obj.row_id) + + @mock.patch("evadb.catalog.catalog_manager.FunctionIOCatalogService") + def test_get_function_inputs(self, function_mock): + mock_func = function_mock.return_value.get_input_entries_by_function_id + function_obj = MagicMock(spec=FunctionCatalogEntry) + CatalogManager(MagicMock(), MagicMock()).get_function_io_catalog_input_entries( + function_obj ) - mock_func.assert_called_once_with(udf_obj.row_id) + mock_func.assert_called_once_with(function_obj.row_id) diff --git a/test/unit_tests/executor/test_create_udf_executor.py b/test/unit_tests/executor/test_create_udf_executor.py index 2b341a84b7..1611f31e07 100644 --- a/test/unit_tests/executor/test_create_udf_executor.py +++ b/test/unit_tests/executor/test_create_udf_executor.py @@ -17,87 +17,88 @@ from mock import MagicMock, patch from evadb.catalog.catalog_type import NdArrayType -from evadb.executor.create_udf_executor import CreateUDFExecutor -from evadb.udfs.decorators.io_descriptors.data_types import PandasDataframe +from evadb.executor.create_function_executor import CreateFunctionExecutor +from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe -class CreateUdfExecutorTest(unittest.TestCase): - 
@patch("evadb.executor.create_udf_executor.load_udf_class_from_file") - def test_should_create_udf(self, load_udf_class_from_file_mock): +class CreateFunctionExecutorTest(unittest.TestCase): + @patch("evadb.executor.create_function_executor.load_function_class_from_file") + def test_should_create_function(self, load_function_class_from_file_mock): catalog_instance = MagicMock() - catalog_instance().get_udf_catalog_entry_by_name.return_value = None - catalog_instance().insert_udf_catalog_entry.return_value = "udf" + catalog_instance().get_function_catalog_entry_by_name.return_value = None + catalog_instance().insert_function_catalog_entry.return_value = "function" impl_path = MagicMock() abs_path = impl_path.absolute.return_value = MagicMock() abs_path.as_posix.return_value = "test.py" - load_udf_class_from_file_mock.return_value.return_value = "mock_class" + load_function_class_from_file_mock.return_value.return_value = "mock_class" plan = type( - "CreateUDFPlan", + "CreateFunctionPlan", (), { - "name": "udf", + "name": "function", "if_not_exists": False, "inputs": ["inp"], "outputs": ["out"], "impl_path": impl_path, - "udf_type": "classification", + "function_type": "classification", "metadata": {"key1": "value1", "key2": "value2"}, }, ) evadb = MagicMock evadb.catalog = catalog_instance evadb.config = MagicMock() - create_udf_executor = CreateUDFExecutor(evadb, plan) - next(create_udf_executor.exec()) - catalog_instance().insert_udf_catalog_entry.assert_called_with( - "udf", + create_function_executor = CreateFunctionExecutor(evadb, plan) + next(create_function_executor.exec()) + catalog_instance().insert_function_catalog_entry.assert_called_with( + "function", "test.py", "classification", ["inp", "out"], {"key1": "value1", "key2": "value2"}, ) - @patch("evadb.executor.create_udf_executor.load_udf_class_from_file") + @patch("evadb.executor.create_function_executor.load_function_class_from_file") def test_should_raise_error_on_incorrect_io_definition( - self, 
load_udf_class_from_file_mock + self, load_function_class_from_file_mock ): catalog_instance = MagicMock() - catalog_instance().get_udf_catalog_entry_by_name.return_value = None - catalog_instance().insert_udf_catalog_entry.return_value = "udf" + catalog_instance().get_function_catalog_entry_by_name.return_value = None + catalog_instance().insert_function_catalog_entry.return_value = "function" impl_path = MagicMock() abs_path = impl_path.absolute.return_value = MagicMock() abs_path.as_posix.return_value = "test.py" - load_udf_class_from_file_mock.return_value.return_value = "mock_class" + load_function_class_from_file_mock.return_value.return_value = "mock_class" incorrect_input_definition = PandasDataframe( columns=["Frame_Array", "Frame_Array_2"], column_types=[NdArrayType.UINT8], column_shapes=[(3, 256, 256), (3, 256, 256)], ) - load_udf_class_from_file_mock.return_value.forward.tags = { + load_function_class_from_file_mock.return_value.forward.tags = { "input": [incorrect_input_definition], "output": [], } plan = type( - "CreateUDFPlan", + "CreateFunctionPlan", (), { - "name": "udf", + "name": "function", "if_not_exists": False, "inputs": [], "outputs": [], "impl_path": impl_path, - "udf_type": "classification", + "function_type": "classification", }, ) evadb = MagicMock evadb.catalog = catalog_instance evadb.config = MagicMock() - create_udf_executor = CreateUDFExecutor(evadb, plan) + create_function_executor = CreateFunctionExecutor(evadb, plan) # check a string in the error message with self.assertRaises(RuntimeError) as exc: - next(create_udf_executor.exec()) + next(create_function_executor.exec()) self.assertIn( - "Error creating UDF, input/output definition incorrect:", str(exc.exception) + "Error creating Function, input/output definition incorrect:", + str(exc.exception), ) - catalog_instance().insert_udf_catalog_entry.assert_not_called() + catalog_instance().insert_function_catalog_entry.assert_not_called() diff --git 
a/test/unit_tests/executor/test_plan_executor.py b/test/unit_tests/executor/test_plan_executor.py index 50b36809aa..b6f749ae8a 100644 --- a/test/unit_tests/executor/test_plan_executor.py +++ b/test/unit_tests/executor/test_plan_executor.py @@ -20,7 +20,7 @@ from evadb.catalog.catalog_type import TableType from evadb.catalog.models.table_catalog import TableCatalogEntry from evadb.executor.create_executor import CreateExecutor -from evadb.executor.create_udf_executor import CreateUDFExecutor +from evadb.executor.create_function_executor import CreateFunctionExecutor from evadb.executor.drop_object_executor import DropObjectExecutor, DropObjectPlan from evadb.executor.insert_executor import InsertExecutor from evadb.executor.load_executor import LoadDataExecutor @@ -28,8 +28,8 @@ from evadb.executor.pp_executor import PPExecutor from evadb.executor.seq_scan_executor import SequentialScanExecutor from evadb.models.storage.batch import Batch +from evadb.plan_nodes.create_function_plan import CreateFunctionPlan from evadb.plan_nodes.create_plan import CreatePlan -from evadb.plan_nodes.create_udf_plan import CreateUDFPlan from evadb.plan_nodes.insert_plan import InsertPlan from evadb.plan_nodes.load_data_plan import LoadDataPlan from evadb.plan_nodes.pp_plan import PPScanPlan @@ -102,10 +102,10 @@ def test_build_execution_tree_should_create_correct_exec_node(self): executor = PlanExecutor(MagicMock(), plan)._build_execution_tree(plan) self.assertIsInstance(executor, InsertExecutor) - # CreateUDFExecutor - plan = CreateUDFPlan("test", False, [], [], MagicMock(), None) + # CreateFunctionExecutor + plan = CreateFunctionPlan("test", False, [], [], MagicMock(), None) executor = PlanExecutor(MagicMock(), plan)._build_execution_tree(plan) - self.assertIsInstance(executor, CreateUDFExecutor) + self.assertIsInstance(executor, CreateFunctionExecutor) # DropObjectExecutor plan = DropObjectPlan(MagicMock(), "test", False) @@ -178,10 +178,10 @@ def 
test_execute_plan_for_create_insert_load_upload_plans(self, mock_build): self.assertEqual(actual, []) - # CreateUDFExecutor + # CreateFunctionExecutor mock_build.reset_mock() - tree = MagicMock(node=CreateUDFPlan(None, False, [], [], None)) + tree = MagicMock(node=CreateFunctionPlan(None, False, [], [], None)) mock_build.return_value = tree actual = list(PlanExecutor(MagicMock(), None).execute_plan()) tree.exec.assert_called_once() diff --git a/test/unit_tests/expression/test_function_expression.py b/test/unit_tests/expression/test_function_expression.py index 803177aa0c..eb649cf409 100644 --- a/test/unit_tests/expression/test_function_expression.py +++ b/test/unit_tests/expression/test_function_expression.py @@ -19,9 +19,9 @@ from evadb.constants import NO_GPU from evadb.expression.function_expression import FunctionExpression +from evadb.functions.gpu_compatible import GPUCompatible from evadb.models.storage.batch import Batch from evadb.parser.alias import Alias -from evadb.udfs.gpu_compatible import GPUCompatible class FunctionExpressionTest(unittest.TestCase): diff --git a/test/unit_tests/udfs/__init__.py b/test/unit_tests/functions/__init__.py similarity index 100% rename from test/unit_tests/udfs/__init__.py rename to test/unit_tests/functions/__init__.py diff --git a/test/unit_tests/udfs/data/blurred_dog.jpeg b/test/unit_tests/functions/data/blurred_dog.jpeg similarity index 100% rename from test/unit_tests/udfs/data/blurred_dog.jpeg rename to test/unit_tests/functions/data/blurred_dog.jpeg diff --git a/test/unit_tests/udfs/data/dog.jpeg b/test/unit_tests/functions/data/dog.jpeg similarity index 100% rename from test/unit_tests/udfs/data/dog.jpeg rename to test/unit_tests/functions/data/dog.jpeg diff --git a/test/unit_tests/udfs/data/dog.jpeg.REMOVED.git-id b/test/unit_tests/functions/data/dog.jpeg.REMOVED.git-id similarity index 100% rename from test/unit_tests/udfs/data/dog.jpeg.REMOVED.git-id rename to 
test/unit_tests/functions/data/dog.jpeg.REMOVED.git-id diff --git a/test/unit_tests/udfs/data/dog_cat.jpg b/test/unit_tests/functions/data/dog_cat.jpg similarity index 100% rename from test/unit_tests/udfs/data/dog_cat.jpg rename to test/unit_tests/functions/data/dog_cat.jpg diff --git a/test/unit_tests/udfs/data/dog_cat.jpg.REMOVED.git-id b/test/unit_tests/functions/data/dog_cat.jpg.REMOVED.git-id similarity index 100% rename from test/unit_tests/udfs/data/dog_cat.jpg.REMOVED.git-id rename to test/unit_tests/functions/data/dog_cat.jpg.REMOVED.git-id diff --git a/test/unit_tests/udfs/data/grayscale_dog.jpeg b/test/unit_tests/functions/data/grayscale_dog.jpeg similarity index 100% rename from test/unit_tests/udfs/data/grayscale_dog.jpeg rename to test/unit_tests/functions/data/grayscale_dog.jpeg diff --git a/test/unit_tests/udfs/decorators/__init__.py b/test/unit_tests/functions/decorators/__init__.py similarity index 100% rename from test/unit_tests/udfs/decorators/__init__.py rename to test/unit_tests/functions/decorators/__init__.py diff --git a/test/unit_tests/udfs/decorators/io_descriptors/__init__.py b/test/unit_tests/functions/decorators/io_descriptors/__init__.py similarity index 100% rename from test/unit_tests/udfs/decorators/io_descriptors/__init__.py rename to test/unit_tests/functions/decorators/io_descriptors/__init__.py diff --git a/test/unit_tests/udfs/decorators/io_descriptors/test_descriptors.py b/test/unit_tests/functions/decorators/io_descriptors/test_descriptors.py similarity index 96% rename from test/unit_tests/udfs/decorators/io_descriptors/test_descriptors.py rename to test/unit_tests/functions/decorators/io_descriptors/test_descriptors.py index b373d1a4ad..3837e92996 100644 --- a/test/unit_tests/udfs/decorators/io_descriptors/test_descriptors.py +++ b/test/unit_tests/functions/decorators/io_descriptors/test_descriptors.py @@ -15,15 +15,15 @@ import unittest from evadb.catalog.catalog_type import ColumnType, Dimension, NdArrayType -from 
evadb.udfs.decorators.io_descriptors.data_types import ( +from evadb.functions.decorators.io_descriptors.data_types import ( NumpyArray, PandasDataframe, PyTorchTensor, ) -from evadb.utils.errors import UDFIODefinitionError +from evadb.utils.errors import FunctionIODefinitionError -class UDFIODescriptorsTests(unittest.TestCase): +class FunctionIODescriptorsTests(unittest.TestCase): def test_catalog_entry_for_numpy_entry(self): numpy_array = NumpyArray( name="input", is_nullable=False, type=NdArrayType.UINT8, dimensions=(2, 2) @@ -154,5 +154,5 @@ def test_raises_error_on_incorrect_pandas_definition(self): column_types=[NdArrayType.UINT8], column_shapes=[(3, 256, 256), (3, 256, 256)], ) - with self.assertRaises(UDFIODefinitionError): + with self.assertRaises(FunctionIODefinitionError): pandas_dataframe.generate_catalog_entries() diff --git a/test/unit_tests/udfs/decorators/test_decorators.py b/test/unit_tests/functions/decorators/test_decorators.py similarity index 81% rename from test/unit_tests/udfs/decorators/test_decorators.py rename to test/unit_tests/functions/decorators/test_decorators.py index 7f35239338..477d4515b0 100644 --- a/test/unit_tests/udfs/decorators/test_decorators.py +++ b/test/unit_tests/functions/decorators/test_decorators.py @@ -15,20 +15,23 @@ import unittest from evadb.catalog.catalog_type import NdArrayType -from evadb.udfs.decorators.decorators import forward, setup -from evadb.udfs.decorators.io_descriptors.data_types import NumpyArray, PandasDataframe +from evadb.functions.decorators.decorators import forward, setup +from evadb.functions.decorators.io_descriptors.data_types import ( + NumpyArray, + PandasDataframe, +) class DecoratorTests(unittest.TestCase): def test_setup_flags_are_updated(self): - @setup(cacheable=True, udf_type="classification", batchable=True) + @setup(cacheable=True, function_type="classification", batchable=True) def setup_func(): pass setup_func() self.assertTrue(setup_func.tags["cacheable"]) 
self.assertTrue(setup_func.tags["batchable"]) - self.assertEqual(setup_func.tags["udf_type"], "classification") + self.assertEqual(setup_func.tags["function_type"], "classification") def test_setup_flags_are_updated_with_default_values(self): @setup() @@ -38,7 +41,7 @@ def setup_func(): setup_func() self.assertFalse(setup_func.tags["cacheable"]) self.assertTrue(setup_func.tags["batchable"]) - self.assertEqual(setup_func.tags["udf_type"], "Abstract") + self.assertEqual(setup_func.tags["function_type"], "Abstract") def test_forward_flags_are_updated(self): input_type = PandasDataframe( diff --git a/test/unit_tests/udfs/test_abstract_udf.py b/test/unit_tests/functions/test_abstract_udf.py similarity index 85% rename from test/unit_tests/udfs/test_abstract_udf.py rename to test/unit_tests/functions/test_abstract_udf.py index 72d34e3a68..306486b4eb 100644 --- a/test/unit_tests/udfs/test_abstract_udf.py +++ b/test/unit_tests/functions/test_abstract_udf.py @@ -19,21 +19,21 @@ from test.util import get_all_subclasses, get_mock_object import evadb -from evadb.udfs.abstract.abstract_udf import AbstractUDF -from evadb.udfs.abstract.hf_abstract_udf import AbstractHFUdf -from evadb.udfs.yolo_object_detector import Yolo +from evadb.functions.abstract.abstract_function import AbstractFunction +from evadb.functions.abstract.hf_abstract_function import AbstractHFFunction +from evadb.functions.yolo_object_detector import Yolo -class AbstractUDFTest(unittest.TestCase): - def test_udf_abstract_functions(self): - derived_udf_classes = list(get_all_subclasses(AbstractUDF)) - # Go over each derived class of AbstractUDF - for derived_udf_class in derived_udf_classes: +class AbstractFunctionTest(unittest.TestCase): + def test_function_abstract_functions(self): + derived_function_classes = list(get_all_subclasses(AbstractFunction)) + # Go over each derived class of AbstractFunction + for derived_function_class in derived_function_classes: # skip yolo and HF to avoid downloading model - if 
issubclass(derived_udf_class, (Yolo, AbstractHFUdf)): + if issubclass(derived_function_class, (Yolo, AbstractHFFunction)): continue - if isabstract(derived_udf_class) is False: - class_type = derived_udf_class + if isabstract(derived_function_class) is False: + class_type = derived_function_class # Check class init signature # Ref: https://stackoverflow.com/a/2677263 sig = inspect.signature(class_type.__init__) diff --git a/test/unit_tests/optimizer/rules/test_rules.py b/test/unit_tests/optimizer/rules/test_rules.py index 835d1b1c9c..fbfaed028a 100644 --- a/test/unit_tests/optimizer/rules/test_rules.py +++ b/test/unit_tests/optimizer/rules/test_rules.py @@ -36,10 +36,10 @@ LogicalApplyAndMergeToPhysical, LogicalApplyAndMergeToRayPhysical, LogicalCreateFromSelectToPhysical, + LogicalCreateFunctionFromSelectToPhysical, + LogicalCreateFunctionToPhysical, LogicalCreateIndexToVectorIndex, LogicalCreateToPhysical, - LogicalCreateUDFFromSelectToPhysical, - LogicalCreateUDFToPhysical, LogicalDeleteToPhysical, LogicalDerivedGetToPhysical, LogicalDropObjectToPhysical, @@ -124,8 +124,8 @@ def test_rules_promises_order(self): Promise.LOGICAL_LOAD_TO_PHYSICAL, Promise.LOGICAL_CREATE_TO_PHYSICAL, Promise.LOGICAL_CREATE_FROM_SELECT_TO_PHYSICAL, - Promise.LOGICAL_CREATE_UDF_TO_PHYSICAL, - Promise.LOGICAL_CREATE_UDF_FROM_SELECT_TO_PHYSICAL, + Promise.LOGICAL_CREATE_FUNCTION_TO_PHYSICAL, + Promise.LOGICAL_CREATE_FUNCTION_FROM_SELECT_TO_PHYSICAL, Promise.LOGICAL_SAMPLE_TO_UNIFORMSAMPLE, Promise.LOGICAL_GET_TO_SEQSCAN, Promise.LOGICAL_DERIVED_GET_TO_PHYSICAL, @@ -208,8 +208,8 @@ def test_supported_rules(self): LogicalCreateToPhysical(), LogicalCreateFromSelectToPhysical(), LogicalRenameToPhysical(), - LogicalCreateUDFToPhysical(), - LogicalCreateUDFFromSelectToPhysical(), + LogicalCreateFunctionToPhysical(), + LogicalCreateFunctionFromSelectToPhysical(), LogicalDropObjectToPhysical(), LogicalInsertToPhysical(), LogicalDeleteToPhysical(), diff --git 
a/test/unit_tests/optimizer/test_cascade_optimizer.py b/test/unit_tests/optimizer/test_cascade_optimizer.py index db9724bfd2..e59a6000ef 100644 --- a/test/unit_tests/optimizer/test_cascade_optimizer.py +++ b/test/unit_tests/optimizer/test_cascade_optimizer.py @@ -39,17 +39,17 @@ def tearDown(self): shutdown_ray() file_remove("dummy.avi") - def test_logical_to_physical_udf(self): + def test_logical_to_physical_function(self): load_query = f"LOAD VIDEO '{self.video_file_path}' INTO MyVideo;" execute_query_fetch_all(self.evadb, load_query) - create_udf_query = """CREATE UDF DummyObjectDetector + create_function_query = """CREATE FUNCTION DummyObjectDetector INPUT (Frame_Array NDARRAY UINT8(3, 256, 256)) OUTPUT (label NDARRAY STR(10)) TYPE Classification IMPL 'test/util.py'; """ - execute_query_fetch_all(self.evadb, create_udf_query) + execute_query_fetch_all(self.evadb, create_function_query) select_query = """SELECT id, DummyObjectDetector(data) FROM MyVideo diff --git a/test/unit_tests/optimizer/test_optimizer_utils.py b/test/unit_tests/optimizer/test_optimizer_utils.py index 2adb823322..ebeb5e04af 100644 --- a/test/unit_tests/optimizer/test_optimizer_utils.py +++ b/test/unit_tests/optimizer/test_optimizer_utils.py @@ -15,17 +15,17 @@ import unittest from evadb.catalog.catalog_type import ColumnType, NdArrayType -from evadb.optimizer.optimizer_utils import column_definition_to_udf_io +from evadb.optimizer.optimizer_utils import column_definition_to_function_io from evadb.parser.create_statement import ColumnDefinition class OptimizerUtilsTest(unittest.TestCase): - def test_column_definition_to_udf_io(self): + def test_column_definition_to_function_io(self): col = ColumnDefinition( "data", ColumnType.NDARRAY, NdArrayType.UINT8, (None, None, None) ) col_list = [col, col] - actual = column_definition_to_udf_io(col_list, True) + actual = column_definition_to_function_io(col_list, True) for io in actual: self.assertEqual(io.name, "data") self.assertEqual(io.type, 
ColumnType.NDARRAY) @@ -33,10 +33,10 @@ def test_column_definition_to_udf_io(self): self.assertEqual(io.array_type, NdArrayType.UINT8) self.assertEqual(io.array_dimensions, (None, None, None)) self.assertEqual(io.is_input, True) - self.assertEqual(io.udf_id, None) + self.assertEqual(io.function_id, None) # input not list - actual2 = column_definition_to_udf_io(col, True) + actual2 = column_definition_to_function_io(col, True) for io in actual2: self.assertEqual(io.name, "data") self.assertEqual(io.type, ColumnType.NDARRAY) @@ -44,4 +44,4 @@ def test_column_definition_to_udf_io(self): self.assertEqual(io.array_type, NdArrayType.UINT8) self.assertEqual(io.array_dimensions, (None, None, None)) self.assertEqual(io.is_input, True) - self.assertEqual(io.udf_id, None) + self.assertEqual(io.function_id, None) diff --git a/test/unit_tests/optimizer/test_statement_to_opr_converter.py b/test/unit_tests/optimizer/test_statement_to_opr_converter.py index fc9e4a594d..9a3f2618d9 100644 --- a/test/unit_tests/optimizer/test_statement_to_opr_converter.py +++ b/test/unit_tests/optimizer/test_statement_to_opr_converter.py @@ -22,8 +22,8 @@ Dummy, LogicalApplyAndMerge, LogicalCreate, + LogicalCreateFunction, LogicalCreateIndex, - LogicalCreateUDF, LogicalDelete, LogicalDropObject, LogicalExchange, @@ -48,9 +48,9 @@ Operator, ) from evadb.optimizer.statement_to_opr_converter import StatementToPlanConverter +from evadb.parser.create_function_statement import CreateFunctionStatement from evadb.parser.create_index_statement import CreateIndexStatement from evadb.parser.create_statement import CreateTableStatement -from evadb.parser.create_udf_statement import CreateUDFStatement from evadb.parser.drop_object_statement import DropObjectStatement from evadb.parser.explain_statement import ExplainStatement from evadb.parser.insert_statement import InsertTableStatement @@ -120,16 +120,18 @@ def test_visit_select_should_not_call_visits_for_null_values(self): 
converter._visit_projection.assert_not_called() converter._visit_select_predicate.assert_not_called() - @patch("evadb.optimizer.statement_to_opr_converter.LogicalCreateUDF") + @patch("evadb.optimizer.statement_to_opr_converter.LogicalCreateFunction") @patch( "evadb.optimizer.\ -statement_to_opr_converter.column_definition_to_udf_io" +statement_to_opr_converter.column_definition_to_function_io" ) @patch( "evadb.optimizer.\ -statement_to_opr_converter.metadata_definition_to_udf_metadata" +statement_to_opr_converter.metadata_definition_to_function_metadata" ) - def test_visit_create_udf(self, metadata_def_mock, col_def_mock, l_create_udf_mock): + def test_visit_create_function( + self, metadata_def_mock, col_def_mock, l_create_function_mock + ): converter = StatementToPlanConverter() stmt = MagicMock() stmt.name = "name" @@ -137,31 +139,31 @@ def test_visit_create_udf(self, metadata_def_mock, col_def_mock, l_create_udf_mo stmt.inputs = ["inp"] stmt.outputs = ["out"] stmt.impl_path = "tmp.py" - stmt.udf_type = "classification" + stmt.function_type = "classification" stmt.query = None stmt.metadata = [("key1", "value1"), ("key2", "value2")] col_def_mock.side_effect = ["inp", "out"] metadata_def_mock.side_effect = [{"key1": "value1", "key2": "value2"}] - converter.visit_create_udf(stmt) + converter.visit_create_function(stmt) col_def_mock.assert_any_call(stmt.inputs, True) col_def_mock.assert_any_call(stmt.outputs, False) metadata_def_mock.assert_any_call(stmt.metadata) - l_create_udf_mock.assert_called_once() - l_create_udf_mock.assert_called_with( + l_create_function_mock.assert_called_once() + l_create_function_mock.assert_called_with( stmt.name, stmt.if_not_exists, "inp", "out", stmt.impl_path, - stmt.udf_type, + stmt.function_type, {"key1": "value1", "key2": "value2"}, ) - def test_visit_should_call_create_udf(self): - stmt = MagicMock(spec=CreateUDFStatement) + def test_visit_should_call_create_function(self): + stmt = MagicMock(spec=CreateFunctionStatement) 
converter = StatementToPlanConverter() mock = MagicMock() - converter.visit_create_udf = mock + converter.visit_create_function = mock converter.visit(stmt) mock.assert_called_once() @@ -245,7 +247,7 @@ def test_check_plan_equality(self): plans = [] dummy_plan = Dummy(MagicMock(), MagicMock()) create_plan = LogicalCreate(MagicMock(), MagicMock()) - create_udf_plan = LogicalCreateUDF( + create_function_plan = LogicalCreateFunction( MagicMock(), MagicMock(), MagicMock(), MagicMock(), MagicMock(), MagicMock() ) create_index_plan = LogicalCreateIndex( @@ -280,11 +282,11 @@ def test_check_plan_equality(self): extract_object_plan = LogicalExtractObject( MagicMock(), MagicMock(), MagicMock(), MagicMock() ) - create_plan.append_child(create_udf_plan) + create_plan.append_child(create_function_plan) plans.append(dummy_plan) plans.append(create_plan) - plans.append(create_udf_plan) + plans.append(create_function_plan) plans.append(create_index_plan) plans.append(delete_plan) plans.append(insert_plan) diff --git a/test/unit_tests/parser/test_parser.py b/test/unit_tests/parser/test_parser.py index e50748e51d..339abe7e84 100644 --- a/test/unit_tests/parser/test_parser.py +++ b/test/unit_tests/parser/test_parser.py @@ -22,13 +22,13 @@ from evadb.expression.function_expression import FunctionExpression from evadb.expression.tuple_value_expression import TupleValueExpression from evadb.parser.alias import Alias +from evadb.parser.create_function_statement import CreateFunctionStatement from evadb.parser.create_index_statement import CreateIndexStatement from evadb.parser.create_statement import ( ColConstraintInfo, ColumnDefinition, CreateTableStatement, ) -from evadb.parser.create_udf_statement import CreateUDFStatement from evadb.parser.delete_statement import DeleteTableStatement from evadb.parser.drop_object_statement import DropObjectStatement from evadb.parser.insert_statement import InsertTableStatement @@ -115,7 +115,7 @@ def test_create_index_statement(self): actual_stmt 
= evadb_stmt_list[0] self.assertEqual(actual_stmt, expected_stmt) - # create index on UDF expression + # create index on Function expression create_index_query = ( "CREATE INDEX testindex ON MyVideo (FeatureExtractor(featCol)) USING FAISS;" ) @@ -303,13 +303,13 @@ def test_drop_table_statement(self): drop_stmt = evadb_statement_list[0] self.assertEqual(drop_stmt, expected_stmt) - def test_drop_udf_statement_str(self): - drop_udf_query1 = """DROP UDF MyUDF;""" - drop_udf_query2 = """DROP UDF IF EXISTS MyUDF;""" - expected_stmt1 = DropObjectStatement(ObjectType.UDF, "MyUDF", False) - expected_stmt2 = DropObjectStatement(ObjectType.UDF, "MyUDF", True) - self.assertEqual(str(expected_stmt1), drop_udf_query1) - self.assertEqual(str(expected_stmt2), drop_udf_query2) + def test_drop_function_statement_str(self): + drop_func_query1 = """DROP FUNCTION MyFunc;""" + drop_func_query2 = """DROP FUNCTION IF EXISTS MyFunc;""" + expected_stmt1 = DropObjectStatement(ObjectType.FUNCTION, "MyFunc", False) + expected_stmt2 = DropObjectStatement(ObjectType.FUNCTION, "MyFunc", True) + self.assertEqual(str(expected_stmt1), drop_func_query1) + self.assertEqual(str(expected_stmt2), drop_func_query2) def test_single_statement_queries(self): parser = Parser() @@ -567,10 +567,10 @@ def test_select_statement_sample_class(self): # sample_freq self.assertEqual(select_stmt.from_table.sample_freq, ConstantValueExpression(5)) - def test_select_udf_star(self): + def test_select_function_star(self): parser = Parser() - query = "SELECT DemoUDF(*) FROM DemoDB.DemoTable" + query = "SELECT DemoFunc(*) FROM DemoDB.DemoTable" evadb_stmt_list = parser.parse(query) # check stmt itself @@ -655,19 +655,19 @@ def test_delete_statement(self): self.assertEqual(delete_stmt, expected_stmt) - def test_create_udf_statement(self): + def test_create_function_statement(self): parser = Parser() - create_udf_query = """CREATE UDF IF NOT EXISTS FastRCNN + create_func_query = """CREATE FUNCTION IF NOT EXISTS FastRCNN INPUT 
(Frame_Array NDARRAY UINT8(3, 256, 256)) OUTPUT (Labels NDARRAY STR(10), Bbox NDARRAY UINT8(10, 4)) TYPE Classification IMPL 'data/fastrcnn.py' - "KEY" "VALUE"; + PREDICT "VALUE"; """ expected_cci = ColConstraintInfo() expected_cci.nullable = True - expected_stmt = CreateUDFStatement( + expected_stmt = CreateFunctionStatement( "FastRCNN", True, Path("data/fastrcnn.py"), @@ -690,17 +690,19 @@ def test_create_udf_statement(self): ], "Classification", None, - [("KEY", "VALUE")], + [("predict", "VALUE")], ) - evadb_statement_list = parser.parse(create_udf_query) + evadb_statement_list = parser.parse(create_func_query) self.assertIsInstance(evadb_statement_list, list) self.assertEqual(len(evadb_statement_list), 1) - self.assertEqual(evadb_statement_list[0].stmt_type, StatementType.CREATE_UDF) + self.assertEqual( + evadb_statement_list[0].stmt_type, StatementType.CREATE_FUNCTION + ) self.assertEqual(str(evadb_statement_list[0]), str(expected_stmt)) - create_udf_stmt = evadb_statement_list[0] + create_func_stmt = evadb_statement_list[0] - self.assertEqual(create_udf_stmt, expected_stmt) + self.assertEqual(create_func_stmt, expected_stmt) def test_load_video_data_statement(self): parser = Parser() @@ -789,8 +791,8 @@ def test_should_return_false_for_unequal_expression(self): table, Path("data/video.mp4"), FileFormatType.VIDEO ) insert_stmt = InsertTableStatement(table) - create_udf = CreateUDFStatement( - "udf", + create_func = CreateFunctionStatement( + "func", False, Path("data/fastrcnn.py"), [ @@ -807,8 +809,8 @@ def test_should_return_false_for_unequal_expression(self): select_stmt = SelectStatement() self.assertNotEqual(load_stmt, insert_stmt) self.assertNotEqual(insert_stmt, load_stmt) - self.assertNotEqual(create_udf, insert_stmt) - self.assertNotEqual(select_stmt, create_udf) + self.assertNotEqual(create_func, insert_stmt) + self.assertNotEqual(select_stmt, create_func) def test_create_table_from_select(self): select_query = """SELECT id, Yolo(frame).labels FROM 
MyVideo @@ -949,12 +951,12 @@ def test_class_equality(self): self.assertNotEqual(table_ref, table_info) def test_lark(self): - query = """CREATE UDF FaceDetector + query = """CREATE FUNCTION FaceDetector INPUT (frame NDARRAY UINT8(3, ANYDIM, ANYDIM)) OUTPUT (bboxes NDARRAY FLOAT32(ANYDIM, 4), scores NDARRAY FLOAT32(ANYDIM)) TYPE FaceDetection - IMPL 'evadb/udfs/face_detector.py'; + IMPL 'evadb/functions/face_detector.py'; """ parser = Parser() parser.parse(query) diff --git a/test/unit_tests/parser/test_parser_statements.py b/test/unit_tests/parser/test_parser_statements.py index 83249759b3..dd309ebbf8 100644 --- a/test/unit_tests/parser/test_parser_statements.py +++ b/test/unit_tests/parser/test_parser_statements.py @@ -35,7 +35,7 @@ def test_parser_statement_types(self): "RENAME TABLE student TO student_info", "DROP TABLE IF EXISTS student_info", "DROP TABLE student_info", - "DROP UDF FastRCNN;", + "DROP FUNCTION FastRCNN;", "SELECT MIN(id), MAX(id), SUM(id) FROM ABC", "SELECT CLASS FROM TAIPAI \ WHERE (CLASS = 'VAN' AND REDNESS < 300) OR REDNESS > 500;", @@ -71,15 +71,15 @@ def test_parser_statement_types(self): ON table3.a = table1.a WHERE table1.a <= 5""", """SELECT frame FROM MyVideo JOIN LATERAL ObjectDet(frame) AS OD;""", - """CREATE UDF FaceDetector + """CREATE FUNCTION FaceDetector INPUT (frame NDARRAY UINT8(3, ANYDIM, ANYDIM)) OUTPUT (bboxes NDARRAY FLOAT32(ANYDIM, 4), scores NDARRAY FLOAT32(ANYDIM)) TYPE FaceDetection - IMPL 'evadb/udfs/face_detector.py'; + IMPL 'evadb/functions/face_detector.py'; """, "SHOW TABLES;", - "SHOW UDFS;", + "SHOW FUNCTIONS;", "EXPLAIN SELECT a FROM foo;", """SELECT data FROM MyVideo WHERE id < 5 ORDER BY Similarity(FeatureExtractor(Open("abc.jpg")), @@ -99,7 +99,7 @@ def test_parser_statement_types(self): "Rename Table STUDENT to student_info", "drop table if exists Student_info", "drop table Student_Info", - "Drop udf FASTRCNN;", + "Drop function FASTRCNN;", "Select min(id), max(Id), Sum(Id) from ABC", "select CLASS from 
Taipai where (Class = 'VAN' and REDNESS < 300) or Redness > 500;", "select class, REDNESS from TAIPAI Union all select Class, redness from Shanghai;", @@ -131,12 +131,12 @@ def test_parser_statement_types(self): On Table3.A = Table1.A where Table1.a <= 5""", """Select Frame from MyVideo Join Lateral ObjectDet(Frame) as OD;""", - """Create UDF FaceDetector + """Create FUNCTION FaceDetector Input (Frame ndArray uint8(3, anydim, anydim)) Output (bboxes ndArray float32(anydim, 4), scores ndArray float32(ANYdim)) Type FaceDetection - Impl 'evadb/udfs/face_detector.py'; + Impl 'evadb/functions/face_detector.py'; """, """CREATE DATABASE example_db WITH ENGINE = "postgres", diff --git a/test/unit_tests/plan_nodes/test_plan.py b/test/unit_tests/plan_nodes/test_plan.py index 23f4cfd6fb..daa6ce01a4 100644 --- a/test/unit_tests/plan_nodes/test_plan.py +++ b/test/unit_tests/plan_nodes/test_plan.py @@ -23,8 +23,8 @@ from evadb.parser.table_ref import TableInfo, TableRef from evadb.parser.types import FileFormatType, ObjectType from evadb.plan_nodes.abstract_plan import AbstractPlan +from evadb.plan_nodes.create_function_plan import CreateFunctionPlan from evadb.plan_nodes.create_plan import CreatePlan -from evadb.plan_nodes.create_udf_plan import CreateUDFPlan from evadb.plan_nodes.drop_object_plan import DropObjectPlan from evadb.plan_nodes.insert_plan import InsertPlan from evadb.plan_nodes.load_data_plan import LoadDataPlan @@ -70,27 +70,29 @@ def test_insert_plan(self): dummy_plan_node = InsertPlan(video_id, column_ids, values) self.assertEqual(dummy_plan_node.opr_type, PlanOprType.INSERT) - def test_create_udf_plan(self): - udf_name = "udf" + def test_create_function_plan(self): + function_name = "function" if_not_exists = True - udfIO = "udfIO" - inputs = [udfIO, udfIO] - outputs = [udfIO] + functionIO = "functionIO" + inputs = [functionIO, functionIO] + outputs = [functionIO] impl_path = "test" ty = "classification" - node = CreateUDFPlan(udf_name, if_not_exists, inputs, 
outputs, impl_path, ty) - self.assertEqual(node.opr_type, PlanOprType.CREATE_UDF) + node = CreateFunctionPlan( + function_name, if_not_exists, inputs, outputs, impl_path, ty + ) + self.assertEqual(node.opr_type, PlanOprType.CREATE_FUNCTION) self.assertEqual(node.if_not_exists, True) - self.assertEqual(node.inputs, [udfIO, udfIO]) - self.assertEqual(node.outputs, [udfIO]) + self.assertEqual(node.inputs, [functionIO, functionIO]) + self.assertEqual(node.outputs, [functionIO]) self.assertEqual(node.impl_path, impl_path) - self.assertEqual(node.udf_type, ty) + self.assertEqual(node.function_type, ty) def test_drop_object_plan(self): object_type = ObjectType.TABLE - udf_name = "udf" + function_name = "function" if_exists = True - node = DropObjectPlan(object_type, udf_name, if_exists) + node = DropObjectPlan(object_type, function_name, if_exists) self.assertEqual(node.opr_type, PlanOprType.DROP_OBJECT) self.assertEqual(node.if_exists, True) self.assertEqual(node.object_type, ObjectType.TABLE) diff --git a/test/unit_tests/server/test_interpreter.py b/test/unit_tests/server/test_interpreter.py index 8efab91379..3a9938988c 100644 --- a/test/unit_tests/server/test_interpreter.py +++ b/test/unit_tests/server/test_interpreter.py @@ -42,13 +42,13 @@ async def test_start_cmd_client( server_reader = asyncio.StreamReader() server_writer = MagicMock() - server_reader.feed_data(b"SHOW UDFS;\n") + server_reader.feed_data(b"SHOW FUNCTIONS;\n") server_reader.feed_data(b"EXIT;\n") mock_open.return_value = (server_reader, server_writer) stdin_reader = asyncio.StreamReader() - stdin_reader.feed_data(b"SHOW UDFS;\n") + stdin_reader.feed_data(b"SHOW FUNCTIONS;\n") stdin_reader.feed_data(b"EXIT;\n") stdin_reader.feed_eof() diff --git a/test/unit_tests/server/test_server.py b/test/unit_tests/server/test_server.py index cb1fa84910..f0bc6f7871 100644 --- a/test/unit_tests/server/test_server.py +++ b/test/unit_tests/server/test_server.py @@ -43,7 +43,7 @@ async def test_server_functions(self, 
mock_start): client_writer2 = MagicMock() # first client - client_reader1.feed_data(b"SHOW UDFS;\n") + client_reader1.feed_data(b"SHOW FUNCTIONS;\n") client_reader1.feed_data(b"EXIT;\n") await evadb_server.accept_client(client_reader1, client_writer1) diff --git a/test/unit_tests/storage/test_mysql_native_storage_engine.py b/test/unit_tests/storage/test_mysql_native_storage_engine.py new file mode 100644 index 0000000000..139a705303 --- /dev/null +++ b/test/unit_tests/storage/test_mysql_native_storage_engine.py @@ -0,0 +1,145 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +import unittest +from test.util import get_evadb_for_testing +from unittest.mock import MagicMock, patch + +import pytest + +from evadb.catalog.models.utils import DatabaseCatalogEntry +from evadb.server.command_handler import execute_query_fetch_all + + +class NativeQueryResponse: + def __init__(self): + self.error = None + self.data = None + + +@pytest.mark.notparallel +class MySQLNativeStorageEngineTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_mysql_params(self): + return { + "user": "eva", + "password": "password", + "host": "localhost", + "port": "3306", + "database": "evadb", + } + + def setUp(self): + connection_params = self.get_mysql_params() + self.evadb = get_evadb_for_testing() + + sys.modules["mysql"] = MagicMock() + sys.modules["mysql.connector"] = MagicMock() + + # Create all class level patches + self.get_database_catalog_entry_patcher = patch( + "evadb.catalog.catalog_manager.CatalogManager.get_database_catalog_entry" + ) + self.get_database_catalog_entry_mock = ( + self.get_database_catalog_entry_patcher.start() + ) + + self.execute_native_query_patcher = patch( + "evadb.third_party.databases.mysql.mysql_handler.MysqlHandler.execute_native_query" + ) + self.execute_native_query_mock = self.execute_native_query_patcher.start() + + self.connect_patcher = patch( + "evadb.third_party.databases.mysql.mysql_handler.MysqlHandler.connect" + ) + self.connect_mock = self.connect_patcher.start() + + self.disconnect_patcher = patch( + "evadb.third_party.databases.mysql.mysql_handler.MysqlHandler.disconnect" + ) + self.disconnect_mock = self.disconnect_patcher.start() + + # set return values + self.execute_native_query_mock.return_value = NativeQueryResponse() + self.get_database_catalog_entry_mock.return_value = DatabaseCatalogEntry( + name="test_data_source", engine="mysql", params=connection_params, row_id=1 + ) + + def tearDown(self): + 
self.get_database_catalog_entry_patcher.stop() + self.execute_native_query_patcher.stop() + self.connect_patcher.stop() + self.disconnect_patcher.stop() + + def test_execute_mysql_select_query(self): + execute_query_fetch_all( + self.evadb, + """USE test_data_source { + SELECT * FROM test_table + }""", + ) + + self.connect_mock.assert_called_once() + self.execute_native_query_mock.assert_called_once() + self.get_database_catalog_entry_mock.assert_called_once() + self.disconnect_mock.assert_called_once() + + def test_execute_mysql_insert_query(self): + execute_query_fetch_all( + self.evadb, + """USE test_data_source { + INSERT INTO test_table ( + name, age, comment + ) VALUES ( + 'val', 5, 'testing' + ) + }""", + ) + self.connect_mock.assert_called_once() + self.execute_native_query_mock.assert_called_once() + self.get_database_catalog_entry_mock.assert_called_once() + self.disconnect_mock.assert_called_once() + + def test_execute_mysql_update_query(self): + execute_query_fetch_all( + self.evadb, + """USE test_data_source { + UPDATE test_table + SET comment = 'update' + WHERE age > 5 + }""", + ) + + self.connect_mock.assert_called_once() + self.execute_native_query_mock.assert_called_once() + self.get_database_catalog_entry_mock.assert_called_once() + self.disconnect_mock.assert_called_once() + + def test_execute_mysql_delete_query(self): + execute_query_fetch_all( + self.evadb, + """USE test_data_source { + DELETE FROM test_table + WHERE age < 5 + }""", + ) + + self.connect_mock.assert_called_once() + self.execute_native_query_mock.assert_called_once() + self.get_database_catalog_entry_mock.assert_called_once() + self.disconnect_mock.assert_called_once() diff --git a/test/unit_tests/storage/test_postgres_native_storage_engine.py b/test/unit_tests/storage/test_postgres_native_storage_engine.py new file mode 100644 index 0000000000..985761562c --- /dev/null +++ b/test/unit_tests/storage/test_postgres_native_storage_engine.py @@ -0,0 +1,146 @@ +# coding=utf-8 +# 
Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import unittest +from test.util import get_evadb_for_testing +from unittest.mock import MagicMock, patch + +import pytest + +from evadb.catalog.models.utils import DatabaseCatalogEntry +from evadb.server.command_handler import execute_query_fetch_all + + +class NativeQueryResponse: + def __init__(self): + self.error = None + self.data = None + + +@pytest.mark.notparallel +class PostgresNativeStorageEngineTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_postgres_params(self): + return { + "user": "eva", + "password": "password", + "host": "localhost", + "port": "5432", + "database": "evadb", + } + + def setUp(self): + connection_params = self.get_postgres_params() + self.evadb = get_evadb_for_testing() + + sys.modules["psycopg2"] = MagicMock() + + self.get_database_catalog_entry_patcher = patch( + "evadb.catalog.catalog_manager.CatalogManager.get_database_catalog_entry" + ) + self.get_database_catalog_entry_mock = ( + self.get_database_catalog_entry_patcher.start() + ) + + self.execute_native_query_patcher = patch( + "evadb.third_party.databases.postgres.postgres_handler.PostgresHandler.execute_native_query" + ) + self.execute_native_query_mock = self.execute_native_query_patcher.start() + + self.connect_patcher = patch( + "evadb.third_party.databases.postgres.postgres_handler.PostgresHandler.connect" + ) + 
self.connect_mock = self.connect_patcher.start() + + self.disconnect_patcher = patch( + "evadb.third_party.databases.postgres.postgres_handler.PostgresHandler.disconnect" + ) + self.disconnect_mock = self.disconnect_patcher.start() + + # set return values + self.execute_native_query_mock.return_value = NativeQueryResponse() + self.get_database_catalog_entry_mock.return_value = DatabaseCatalogEntry( + name="test_data_source", + engine="postgres", + params=connection_params, + row_id=1, + ) + + def tearDown(self): + self.get_database_catalog_entry_patcher.stop() + self.execute_native_query_patcher.stop() + self.connect_patcher.stop() + self.disconnect_patcher.stop() + + def test_execute_postgres_select_query(self): + execute_query_fetch_all( + self.evadb, + """USE test_data_source { + SELECT * FROM test_table + }""", + ) + + self.connect_mock.assert_called_once() + self.execute_native_query_mock.assert_called_once() + self.get_database_catalog_entry_mock.assert_called_once() + self.disconnect_mock.assert_called_once() + + def test_execute_postgres_insert_query(self): + execute_query_fetch_all( + self.evadb, + """USE test_data_source { + INSERT INTO test_table ( + name, age, comment + ) VALUES ( + 'val', 5, 'testing' + ) + }""", + ) + self.connect_mock.assert_called_once() + self.execute_native_query_mock.assert_called_once() + self.get_database_catalog_entry_mock.assert_called_once() + self.disconnect_mock.assert_called_once() + + def test_execute_postgres_update_query(self): + execute_query_fetch_all( + self.evadb, + """USE test_data_source { + UPDATE test_table + SET comment = 'update' + WHERE age > 5 + }""", + ) + + self.connect_mock.assert_called_once() + self.execute_native_query_mock.assert_called_once() + self.get_database_catalog_entry_mock.assert_called_once() + self.disconnect_mock.assert_called_once() + + def test_execute_postgres_delete_query(self): + execute_query_fetch_all( + self.evadb, + """USE test_data_source { + DELETE FROM test_table + WHERE age < 
5 + }""", + ) + + self.connect_mock.assert_called_once() + self.execute_native_query_mock.assert_called_once() + self.get_database_catalog_entry_mock.assert_called_once() + self.disconnect_mock.assert_called_once() diff --git a/test/unit_tests/storage/test_sqlite_native_storage_engine.py b/test/unit_tests/storage/test_sqlite_native_storage_engine.py new file mode 100644 index 0000000000..ee7e90bb25 --- /dev/null +++ b/test/unit_tests/storage/test_sqlite_native_storage_engine.py @@ -0,0 +1,137 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +from test.util import get_evadb_for_testing +from unittest.mock import patch + +import pytest + +from evadb.catalog.models.utils import DatabaseCatalogEntry +from evadb.server.command_handler import execute_query_fetch_all + + +class NativeQueryResponse: + def __init__(self): + self.error = None + self.data = None + + +@pytest.mark.notparallel +class SQLiteNativeStorageEngineTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_sqlite_params(self): + return { + "database": "evadb.db", + } + + def setUp(self): + connection_params = self.get_sqlite_params() + self.evadb = get_evadb_for_testing() + + # Create all class level patches + self.get_database_catalog_entry_patcher = patch( + "evadb.catalog.catalog_manager.CatalogManager.get_database_catalog_entry" + ) + self.get_database_catalog_entry_mock = ( + self.get_database_catalog_entry_patcher.start() + ) + + self.execute_native_query_patcher = patch( + "evadb.third_party.databases.sqlite.sqlite_handler.SQLiteHandler.execute_native_query" + ) + self.execute_native_query_mock = self.execute_native_query_patcher.start() + + self.connect_patcher = patch( + "evadb.third_party.databases.sqlite.sqlite_handler.SQLiteHandler.connect" + ) + self.connect_mock = self.connect_patcher.start() + + self.disconnect_patcher = patch( + "evadb.third_party.databases.sqlite.sqlite_handler.SQLiteHandler.disconnect" + ) + self.disconnect_mock = self.disconnect_patcher.start() + + # set return values + self.execute_native_query_mock.return_value = NativeQueryResponse() + self.get_database_catalog_entry_mock.return_value = DatabaseCatalogEntry( + name="test_data_source", engine="sqlite", params=connection_params, row_id=1 + ) + + def tearDown(self): + self.get_database_catalog_entry_patcher.stop() + self.execute_native_query_patcher.stop() + self.connect_patcher.stop() + self.disconnect_patcher.stop() + + def test_execute_sqlite_select_query(self): + 
execute_query_fetch_all( + self.evadb, + """USE test_data_source { + SELECT * FROM test_table + }""", + ) + + self.connect_mock.assert_called_once() + self.execute_native_query_mock.assert_called_once() + self.get_database_catalog_entry_mock.assert_called_once() + self.disconnect_mock.assert_called_once() + + def test_execute_sqlite_insert_query(self): + execute_query_fetch_all( + self.evadb, + """USE test_data_source { + INSERT INTO test_table ( + name, age, comment + ) VALUES ( + 'val', 5, 'testing' + ) + }""", + ) + self.connect_mock.assert_called_once() + self.execute_native_query_mock.assert_called_once() + self.get_database_catalog_entry_mock.assert_called_once() + self.disconnect_mock.assert_called_once() + + def test_execute_sqlite_update_query(self): + execute_query_fetch_all( + self.evadb, + """USE test_data_source { + UPDATE test_table + SET comment = 'update' + WHERE age > 5 + }""", + ) + + self.connect_mock.assert_called_once() + self.execute_native_query_mock.assert_called_once() + self.get_database_catalog_entry_mock.assert_called_once() + self.disconnect_mock.assert_called_once() + + def test_execute_sqlite_delete_query(self): + execute_query_fetch_all( + self.evadb, + """USE test_data_source { + DELETE FROM test_table + WHERE age < 5 + }""", + ) + + self.connect_mock.assert_called_once() + self.execute_native_query_mock.assert_called_once() + self.get_database_catalog_entry_mock.assert_called_once() + self.disconnect_mock.assert_called_once() diff --git a/test/util.py b/test/util.py index 199c064cf7..39be10449c 100644 --- a/test/util.py +++ b/test/util.py @@ -33,6 +33,13 @@ from evadb.configuration.constants import EvaDB_DATABASE_DIR, EvaDB_INSTALLATION_DIR from evadb.database import init_evadb_instance from evadb.expression.function_expression import FunctionExpression +from evadb.functions.abstract.abstract_function import AbstractClassifierFunction +from evadb.functions.decorators import decorators +from 
evadb.functions.decorators.io_descriptors.data_types import ( + NumpyArray, + PandasDataframe, +) +from evadb.functions.function_bootstrap_queries import init_builtin_functions from evadb.models.storage.batch import Batch from evadb.optimizer.operators import LogicalFilter, Operator from evadb.optimizer.plan_generator import PlanGenerator @@ -40,10 +47,6 @@ from evadb.parser.parser import Parser from evadb.plan_nodes.abstract_plan import AbstractPlan from evadb.server.command_handler import execute_query_fetch_all -from evadb.udfs.abstract.abstract_udf import AbstractClassifierUDF -from evadb.udfs.decorators import decorators -from evadb.udfs.decorators.io_descriptors.data_types import NumpyArray, PandasDataframe -from evadb.udfs.udf_bootstrap_queries import init_builtin_udfs from evadb.utils.generic_utils import ( is_ray_available, remove_directory_contents, @@ -233,14 +236,14 @@ def get_physical_query_plan( return p_plan -def remove_udf_cache(db, query): +def remove_function_cache(db, query): plan = next(get_logical_query_plan(db, query).find_all(LogicalFilter)) func_exprs = plan.predicate.find_all(FunctionExpression) for expr in func_exprs: cache_name = expr.signature() - udf_cache = db.catalog.get_udf_cache_catalog_entry_by_name(cache_name) - if udf_cache is not None: - cache_dir = Path(udf_cache.cache_path) + function_cache = db.catalog.get_function_cache_catalog_entry_by_name(cache_name) + if function_cache is not None: + cache_dir = Path(function_cache.cache_path) if cache_dir.exists(): shutil.rmtree(cache_dir) @@ -505,12 +508,12 @@ def create_dummy_4d_batches( yield Batch(df) -def load_udfs_for_testing(db, mode="debug"): - # DEBUG MODE: ALL UDFs - init_builtin_udfs(db, mode=mode) +def load_functions_for_testing(db, mode="debug"): + # DEBUG MODE: ALL Functions + init_builtin_functions(db, mode=mode) -class DummyObjectDetector(AbstractClassifierUDF): +class DummyObjectDetector(AbstractClassifierFunction): def setup(self, *args, **kwargs): pass @@ -534,7 
+537,7 @@ def classify_one(self, frames: np.ndarray): return np.array([label]) -class DummyMultiObjectDetector(AbstractClassifierUDF): +class DummyMultiObjectDetector(AbstractClassifierFunction): """ Returns multiple objects for each frame """ @@ -561,7 +564,7 @@ def classify_one(self, frames: np.ndarray): return np.array([label, label]) -class DummyFeatureExtractor(AbstractClassifierUDF): +class DummyFeatureExtractor(AbstractClassifierFunction): """ Returns a feature for a frame. """ @@ -591,8 +594,8 @@ def _extract_feature(row: pd.Series): return ret -class DummyObjectDetectorDecorators(AbstractClassifierUDF): - @decorators.setup(cacheable=True, udf_type="object_detection", batchable=True) +class DummyObjectDetectorDecorators(AbstractClassifierFunction): + @decorators.setup(cacheable=True, function_type="object_detection", batchable=True) def setup(self, *args, **kwargs): pass diff --git a/tutorials/11-similarity-search-for-motif-mining.ipynb b/tutorials/11-similarity-search-for-motif-mining.ipynb index d31af357c9..2a27efa7ef 100644 --- a/tutorials/11-similarity-search-for-motif-mining.ipynb +++ b/tutorials/11-similarity-search-for-motif-mining.ipynb @@ -294,7 +294,7 @@ "source": [ "cursor.query(\"DROP UDF IF EXISTS SiftFeatureExtractor;\").df()\n", "cursor.query(\"\"\"CREATE UDF IF NOT EXISTS SiftFeatureExtractor\n", - " IMPL '../evadb/udfs/sift_feature_extractor.py'\"\"\").df()" + " IMPL '../evadb/functions/sift_feature_extractor.py'\"\"\").df()" ] }, { diff --git a/tutorials/14-food-review-tone-analysis-and-response.ipynb b/tutorials/14-food-review-tone-analysis-and-response.ipynb new file mode 100644 index 0000000000..2c659fd605 --- /dev/null +++ b/tutorials/14-food-review-tone-analysis-and-response.ipynb @@ -0,0 +1,1634 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "nQ7xbeAhW56k" + }, + "source": [ + "# Food Review Tone Analysis and Response\n", + "In this tutorial, we use EvaDB + ChatGPT to analyze whether a food review is 
negative or not. Based on the analysis, we then use EvaDB + ChatGPT again to form a polite response to address negative review." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + " \n", + " \n", + " \n", + "
\n", + " Run on Google Colab\n", + " \n", + " View source on GitHub\n", + " \n", + " Download notebook\n", + "


" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u4duS22nW56m" + }, + "source": [ + "## Start Postgres" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "iwWMlVaVaOuH", + "outputId": "41d6f054-4257-4d54-dba0-ffc7098a8c6d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading package lists... Done\n", + "Building dependency tree... Done\n", + "Reading state information... Done\n", + "The following additional packages will be installed:\n", + " libcommon-sense-perl libjson-perl libjson-xs-perl libtypes-serialiser-perl\n", + " logrotate netbase postgresql-14 postgresql-client-14\n", + " postgresql-client-common postgresql-common ssl-cert sysstat\n", + "Suggested packages:\n", + " bsd-mailx | mailx postgresql-doc postgresql-doc-14 isag\n", + "The following NEW packages will be installed:\n", + " libcommon-sense-perl libjson-perl libjson-xs-perl libtypes-serialiser-perl\n", + " logrotate netbase postgresql postgresql-14 postgresql-client-14\n", + " postgresql-client-common postgresql-common ssl-cert sysstat\n", + "0 upgraded, 13 newly installed, 0 to remove and 16 not upgraded.\n", + "Need to get 18.3 MB of archives.\n", + "After this operation, 51.5 MB of additional disk space will be used.\n", + "Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 logrotate amd64 3.19.0-1ubuntu1.1 [54.3 kB]\n", + "Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 netbase all 6.3 [12.9 kB]\n", + "Get:3 http://archive.ubuntu.com/ubuntu jammy/main amd64 libcommon-sense-perl amd64 3.75-2build1 [21.1 kB]\n", + "Get:4 http://archive.ubuntu.com/ubuntu jammy/main amd64 libjson-perl all 4.04000-1 [81.8 kB]\n", + "Get:5 http://archive.ubuntu.com/ubuntu jammy/main amd64 libtypes-serialiser-perl all 1.01-1 [11.6 kB]\n", + "Get:6 http://archive.ubuntu.com/ubuntu jammy/main amd64 libjson-xs-perl amd64 4.030-1build3 [87.2 kB]\n", + 
"Get:7 http://archive.ubuntu.com/ubuntu jammy/main amd64 postgresql-client-common all 238 [29.6 kB]\n", + "Get:8 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 postgresql-client-14 amd64 14.9-0ubuntu0.22.04.1 [1,222 kB]\n", + "Get:9 http://archive.ubuntu.com/ubuntu jammy/main amd64 ssl-cert all 1.1.2 [17.4 kB]\n", + "Get:10 http://archive.ubuntu.com/ubuntu jammy/main amd64 postgresql-common all 238 [169 kB]\n", + "Get:11 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 postgresql-14 amd64 14.9-0ubuntu0.22.04.1 [16.1 MB]\n", + "Get:12 http://archive.ubuntu.com/ubuntu jammy/main amd64 postgresql all 14+238 [3,288 B]\n", + "Get:13 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 sysstat amd64 12.5.2-2ubuntu0.2 [487 kB]\n", + "Fetched 18.3 MB in 1s (25.2 MB/s)\n", + "Preconfiguring packages ...\n", + "Selecting previously unselected package logrotate.\n", + "(Reading database ... 120831 files and directories currently installed.)\n", + "Preparing to unpack .../00-logrotate_3.19.0-1ubuntu1.1_amd64.deb ...\n", + "Unpacking logrotate (3.19.0-1ubuntu1.1) ...\n", + "Selecting previously unselected package netbase.\n", + "Preparing to unpack .../01-netbase_6.3_all.deb ...\n", + "Unpacking netbase (6.3) ...\n", + "Selecting previously unselected package libcommon-sense-perl:amd64.\n", + "Preparing to unpack .../02-libcommon-sense-perl_3.75-2build1_amd64.deb ...\n", + "Unpacking libcommon-sense-perl:amd64 (3.75-2build1) ...\n", + "Selecting previously unselected package libjson-perl.\n", + "Preparing to unpack .../03-libjson-perl_4.04000-1_all.deb ...\n", + "Unpacking libjson-perl (4.04000-1) ...\n", + "Selecting previously unselected package libtypes-serialiser-perl.\n", + "Preparing to unpack .../04-libtypes-serialiser-perl_1.01-1_all.deb ...\n", + "Unpacking libtypes-serialiser-perl (1.01-1) ...\n", + "Selecting previously unselected package libjson-xs-perl.\n", + "Preparing to unpack .../05-libjson-xs-perl_4.030-1build3_amd64.deb ...\n", + 
"Unpacking libjson-xs-perl (4.030-1build3) ...\n", + "Selecting previously unselected package postgresql-client-common.\n", + "Preparing to unpack .../06-postgresql-client-common_238_all.deb ...\n", + "Unpacking postgresql-client-common (238) ...\n", + "Selecting previously unselected package postgresql-client-14.\n", + "Preparing to unpack .../07-postgresql-client-14_14.9-0ubuntu0.22.04.1_amd64.deb ...\n", + "Unpacking postgresql-client-14 (14.9-0ubuntu0.22.04.1) ...\n", + "Selecting previously unselected package ssl-cert.\n", + "Preparing to unpack .../08-ssl-cert_1.1.2_all.deb ...\n", + "Unpacking ssl-cert (1.1.2) ...\n", + "Selecting previously unselected package postgresql-common.\n", + "Preparing to unpack .../09-postgresql-common_238_all.deb ...\n", + "Adding 'diversion of /usr/bin/pg_config to /usr/bin/pg_config.libpq-dev by postgresql-common'\n", + "Unpacking postgresql-common (238) ...\n", + "Selecting previously unselected package postgresql-14.\n", + "Preparing to unpack .../10-postgresql-14_14.9-0ubuntu0.22.04.1_amd64.deb ...\n", + "Unpacking postgresql-14 (14.9-0ubuntu0.22.04.1) ...\n", + "Selecting previously unselected package postgresql.\n", + "Preparing to unpack .../11-postgresql_14+238_all.deb ...\n", + "Unpacking postgresql (14+238) ...\n", + "Selecting previously unselected package sysstat.\n", + "Preparing to unpack .../12-sysstat_12.5.2-2ubuntu0.2_amd64.deb ...\n", + "Unpacking sysstat (12.5.2-2ubuntu0.2) ...\n", + "Setting up logrotate (3.19.0-1ubuntu1.1) ...\n", + "Created symlink /etc/systemd/system/timers.target.wants/logrotate.timer → /lib/systemd/system/logrotate.timer.\n", + "Setting up libcommon-sense-perl:amd64 (3.75-2build1) ...\n", + "Setting up ssl-cert (1.1.2) ...\n", + "Setting up libtypes-serialiser-perl (1.01-1) ...\n", + "Setting up libjson-perl (4.04000-1) ...\n", + "Setting up netbase (6.3) ...\n", + "Setting up sysstat (12.5.2-2ubuntu0.2) ...\n", + "\n", + "Creating config file /etc/default/sysstat with new version\n", + 
"update-alternatives: using /usr/bin/sar.sysstat to provide /usr/bin/sar (sar) in auto mode\n", + "Created symlink /etc/systemd/system/sysstat.service.wants/sysstat-collect.timer → /lib/systemd/system/sysstat-collect.timer.\n", + "Created symlink /etc/systemd/system/sysstat.service.wants/sysstat-summary.timer → /lib/systemd/system/sysstat-summary.timer.\n", + "Created symlink /etc/systemd/system/multi-user.target.wants/sysstat.service → /lib/systemd/system/sysstat.service.\n", + "Setting up postgresql-client-common (238) ...\n", + "Setting up libjson-xs-perl (4.030-1build3) ...\n", + "Setting up postgresql-client-14 (14.9-0ubuntu0.22.04.1) ...\n", + "update-alternatives: using /usr/share/postgresql/14/man/man1/psql.1.gz to provide /usr/share/man/man1/psql.1.gz (psql.1.gz) in auto mode\n", + "Setting up postgresql-common (238) ...\n", + "Adding user postgres to group ssl-cert\n", + "\n", + "Creating config file /etc/postgresql-common/createcluster.conf with new version\n", + "Building PostgreSQL dictionaries from installed myspell/hunspell packages...\n", + "Removing obsolete dictionary files:\n", + "Created symlink /etc/systemd/system/multi-user.target.wants/postgresql.service → /lib/systemd/system/postgresql.service.\n", + "Setting up postgresql-14 (14.9-0ubuntu0.22.04.1) ...\n", + "Creating new PostgreSQL cluster 14/main ...\n", + "/usr/lib/postgresql/14/bin/initdb -D /var/lib/postgresql/14/main --auth-local peer --auth-host scram-sha-256 --no-instructions\n", + "The files belonging to this database system will be owned by user \"postgres\".\n", + "This user must also own the server process.\n", + "\n", + "The database cluster will be initialized with locale \"en_US.UTF-8\".\n", + "The default database encoding has accordingly been set to \"UTF8\".\n", + "The default text search configuration will be set to \"english\".\n", + "\n", + "Data page checksums are disabled.\n", + "\n", + "fixing permissions on existing directory /var/lib/postgresql/14/main ... 
ok\n", + "creating subdirectories ... ok\n", + "selecting dynamic shared memory implementation ... posix\n", + "selecting default max_connections ... 100\n", + "selecting default shared_buffers ... 128MB\n", + "selecting default time zone ... Etc/UTC\n", + "creating configuration files ... ok\n", + "running bootstrap script ... ok\n", + "performing post-bootstrap initialization ... ok\n", + "syncing data to disk ... ok\n", + "update-alternatives: using /usr/share/postgresql/14/man/man1/postmaster.1.gz to provide /usr/share/man/man1/postmaster.1.gz (postmaster.1.gz) in auto mode\n", + "invoke-rc.d: could not determine current runlevel\n", + "invoke-rc.d: policy-rc.d denied execution of start.\n", + "Setting up postgresql (14+238) ...\n", + "Processing triggers for man-db (2.10.2-1) ...\n", + " * Starting PostgreSQL 14 database server\n", + " ...done.\n" + ] + } + ], + "source": [ + "!apt install postgresql\n", + "!service postgresql start" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mUndbObdW56m" + }, + "source": [ + "## Create User and Database" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VonZMyLaesil", + "outputId": "f7a47b41-598e-4308-c7aa-4d1ed64fc265" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CREATE ROLE\n", + "CREATE DATABASE\n" + ] + } + ], + "source": [ + "!sudo -u postgres psql -c \"CREATE USER eva WITH SUPERUSER PASSWORD 'password'\"\n", + "!sudo -u postgres psql -c \"CREATE DATABASE evadb\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OQ8JQqqaW56n" + }, + "source": [ + "## Install EvaDB" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "i-pKJsODavE6", + "outputId": "72b71066-df49-47e1-8404-3d5950326c46" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "Requirement already satisfied: psycopg2 in /usr/local/lib/python3.10/dist-packages (2.9.7)\n" + ] + } + ], + "source": [ + "%pip install --quiet \"evadb[document]\"\n", + "%pip install psycopg2\n", + "\n", + "import evadb\n", + "cursor = evadb.connect().cursor()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "id": "YT8nVVKoaC_D" + }, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "from IPython.core.display import display, HTML\n", + "def pretty_print(df):\n", + " return display(HTML( df.to_html().replace(\"\\\\n\",\"
\")))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nnXs1OxpW56o" + }, + "source": [ + "## Create Data Source in EvaDB\n", + "We use data source to connect EvaDB directly to underlying database systems like Postgres." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 + }, + "id": "PkbSv2gKc5Nd", + "outputId": "0d5863e4-754f-404b-9afd-c8f4530fcf91" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0
0The database postgres_data has been successful...
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " 0\n", + "0 The database postgres_data has been successful..." + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "params = {\n", + " \"user\": \"eva\",\n", + " \"password\": \"password\",\n", + " \"host\": \"localhost\",\n", + " \"port\": \"5432\",\n", + " \"database\": \"evadb\",\n", + "}\n", + "query = f\"CREATE DATABASE postgres_data WITH ENGINE = 'postgres', PARAMETERS = {params};\"\n", + "cursor.query(query).df()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ew9wsQUwW56p" + }, + "source": [ + "## Create Review Table" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 + }, + "id": "tt1MB5CrtYHy", + "outputId": "90f9f1e2-fce7-4396-dfa7-81af843e8e45" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
status
0success
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " status\n", + "0 success" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cursor.query(\"\"\"\n", + "USE postgres_data {\n", + " DROP TABLE IF EXISTS review_table\n", + "}\n", + "\"\"\").df()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 + }, + "id": "BJnQCg7RbImb", + "outputId": "2c46662c-085f-4fce-9e36-16875f3a6426" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
status
0success
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " status\n", + "0 success" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cursor.query(\"\"\"\n", + "USE postgres_data {\n", + " CREATE TABLE review_table (name VARCHAR(10), review VARCHAR(1000))\n", + "}\n", + "\"\"\").df()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HvxsDeaXW56p" + }, + "source": [ + "## Insert Reviews into Postgres\n", + "In this example, we directly insert data into the table for simplicity. We can also load data from files like CSV using Postgres APIs." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 + }, + "id": "l-pYla7Jugx7", + "outputId": "fff66893-4dcf-4cb6-8fa5-6b5dc7882c42" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
status
0success
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " status\n", + "0 success" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = \"\"\"\n", + "USE postgres_data {\n", + " INSERT INTO review_table (name, review) VALUES ('Customer 1', 'I ordered fried rice but it is too salty.')\n", + "}\n", + "\"\"\"\n", + "cursor.query(query).df()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 + }, + "id": "tfKoI3s9fcSa", + "outputId": "04b47285-72d3-4108-c5b8-8bc21fa1da44" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
status
0success
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " status\n", + "0 success" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = \"\"\"\n", + "USE postgres_data {\n", + " INSERT INTO review_table (name, review) VALUES ('Customer 2', 'I ordered burger. It tastes very good and the service is exceptional.')\n", + "}\n", + "\"\"\"\n", + "cursor.query(query).df()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81 + }, + "id": "Koj8vwWnYEey", + "outputId": "476f1d7c-fd21-4f01-ec86-ef15bd64f16b" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
status
0success
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " status\n", + "0 success" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = \"\"\"\n", + "USE postgres_data {\n", + " INSERT INTO review_table (name, review) VALUES ('Customer 3', 'I ordered a takeout order, but the chicken sandwidth is missing from the order.')\n", + "}\n", + "\"\"\"\n", + "cursor.query(query).df()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zdxBe4yzW56q" + }, + "source": [ + "## Review Table Content\n", + "Now we have 3 reviews from different customers stored in the table." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 143 + }, + "id": "oA_CNDbxveoo", + "outputId": "ca1632b9-db2b-4694-87a1-b0c41184748c" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
review_table.namereview_table.review
0Customer 1ordered fried rice but it is too salty.
1Customer 2I ordered burger. It tastes very good and the ...
2Customer 3I ordered a takeout order, but the chicken san...
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " review_table.name review_table.review\n", + "0 Customer 1 ordered fried rice but it is too salty.\n", + "1 Customer 2 I ordered burger. It tastes very good and the ...\n", + "2 Customer 3 I ordered a takeout order, but the chicken san..." + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cursor.query(\"SELECT * FROM postgres_data.review_table;\").df()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "b0p1__K0W56u" + }, + "source": [ + "## Register OpenAI Token" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "rsUHb1_Ih2dE" + }, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"OPENAI_KEY\"] = \"sk-...\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3bAzB5gfW56u" + }, + "source": [ + "## Tone Analysis for All Reviews\n", + "Here, we use ChatGPT with customized prompt to summarize whether the review tone is \"postive\" or \"negative\"." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 143 + }, + "id": "W8J_N1WPiOVj", + "outputId": "cf7b2383-6cb8-4597-c7e7-505e14dbcac7" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chatgpt.response
0negative
1positive
2negative
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " chatgpt.response\n", + "0 negative\n", + "1 positive\n", + "2 negative" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cursor.query(\"\"\"\n", + "SELECT ChatGPT(\n", + " \"Is the review positive or negative. Only reply 'positive' or 'negative'. Here are examples. The food is very bad: negative. The food is very good: postive.\",\n", + " review\n", + ")\n", + "FROM postgres_data.review_table;\n", + "\"\"\").df()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "n_0R8dG_W56u" + }, + "source": [ + "## Respond to Negative Reviews\n", + "EvaDB allows users to filter data based on function output. In this query, we construct a query to filter out \"postive\" reviews. We then use a second ChatGPT with customized prompt to propose a solution to address customer's negative reviews politely." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 703 + }, + "id": "QCX7gDJuiV15", + "outputId": "2ceca7cc-c992-492b-9506-c646b681c429" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chatgpt.response
0Dear valued customer,

Thank you for bringing this matter to our attention. We apologize for the inconvenience caused by the excessive saltiness of your fried rice. We understand how important it is to have a satisfying dining experience, and we would like to make it right for you.

To address your concern, we have taken the following steps:

1. Recipe adjustment: We have reviewed our fried rice recipe and made necessary adjustments to ensure that the saltiness is balanced and meets our customers' expectations.

2. Staff training: We have conducted additional training sessions with our kitchen staff to emphasize the importance of proper seasoning and taste testing before serving any dish.

3. Quality control: Our management team will be implementing stricter quality control measures to ensure that every dish leaving our kitchen meets our high standards.

We would like to invite you to give us another chance to serve you. Please reach out to our customer service team, and we will be more than happy to offer you a replacement dish or a refund for your order. We value your feedback and want to ensure that you have a positive experience with us.

Once again, we apologize for any inconvenience caused, and we appreciate your understanding. We look forward to the opportunity to make it right for you.

Best regards,
[Your Name]
[Restaurant Name]
1Dear [Customer's Name],

Thank you for bringing this issue to our attention. We apologize for the inconvenience caused by the missing chicken sandwich in your takeout order. We understand how frustrating it can be when an item is missing from your meal.

To address this concern, we would like to offer you two possible solutions:

1. Replacement: We can arrange for a new chicken sandwich to be prepared and delivered to your location as soon as possible. Please let us know your preferred time for the replacement.

2. Refund: If you prefer not to receive a replacement, we can issue a refund for the missing chicken sandwich. The refund will be processed through the same payment method used for the original order.

Please let us know which option you would prefer, and we will take immediate action to resolve this issue for you. We value your satisfaction and want to ensure that you have a positive experience with our service.

Once again, we apologize for any inconvenience caused, and we appreciate your understanding. If you have any further questions or concerns, please don't hesitate to reach out to us.

Best regards,
[Your Name]
[Restaurant Name]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "response_df = cursor.query(\"\"\"\n", + "SELECT ChatGPT(\n", + " \"Respond the the review with solution to address the review's concern\",\n", + " review\n", + ")\n", + "FROM postgres_data.review_table\n", + "WHERE ChatGPT(\n", + " \"Is the review positive or negative. Only reply 'positive' or 'negative'. Here are examples. The food is very bad: negative. The food is very good: postive.\",\n", + " review\n", + ") = \"negative\";\n", + "\"\"\").df()\n", + "\n", + "pretty_print(response_df)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}