From e5a91909ab1774d531dd803697f92d5d95e3ccc7 Mon Sep 17 00:00:00 2001 From: Fikre Mengistu <70618860+fikremen@users.noreply.github.com> Date: Sun, 3 Dec 2023 04:09:14 -0500 Subject: [PATCH] feat: supporting spaces in column names for csv files (#1388) takes `reverse quote id`, removes back ticks, and converts it to `simple id`. --------- Co-authored-by: americast Co-authored-by: Andy Xu --- docs/source/overview/faq.rst | 13 ++++++ .../lark_visitor/_common_clauses_ids.py | 7 +++ .../short/test_load_executor.py | 44 +++++++++++++++++++ test/util.py | 31 +++++++++++++ 4 files changed, 95 insertions(+) diff --git a/docs/source/overview/faq.rst b/docs/source/overview/faq.rst index f2d4b61aed..b0dacfa41e 100644 --- a/docs/source/overview/faq.rst +++ b/docs/source/overview/faq.rst @@ -34,3 +34,16 @@ If a query runs a complex AI task (e.g., sentiment analysis) on a large table, t top pgrep evadb_server +Can column names contain spaces? +-------------------------------- + +For column names that contain spaces, enclose the name in backticks (reverse quotes). Below are example ``CREATE TABLE`` and ``SELECT`` queries: + +.. 
def uid(self, tree):
    """Resolve a ``uid`` parse node to its identifier.

    A back-quoted identifier (a first child whose ``type`` is
    ``REVERSE_QUOTE_ID``) is returned as a plain string with every
    backtick removed, which is what allows column names containing
    spaces. Any other first child is delegated to ``self.visit``.

    Args:
        tree: parse node; only ``tree.children[0]`` is inspected.

    Returns:
        The identifier text without backticks for a back-quoted token,
        otherwise whatever ``self.visit`` returns for the first child.
    """
    first_child = tree.children[0]
    # Not every child exposes ``type``; getattr with a default covers both
    # cases without a separate hasattr probe.
    if getattr(first_child, "type", None) == "REVERSE_QUOTE_ID":
        # The token itself is left untouched: the stripped text is returned
        # directly, so relabelling the token would only mutate the parse
        # tree and make a second visit of this node take the other branch.
        return str(first_child).replace("`", "")
    return self.visit(first_child)
def create_csv_with_column_name_spaces(num_frames=NUM_FRAMES):
    """Write a dummy CSV whose column headers contain spaces.

    The file is written to ``dummy.csv`` inside ``get_tmp_dir()``; any
    existing file at that path is removed first. It holds one row per
    (video, frame) pair for two videos, with the columns ``id``,
    ``frame id``, ``video id``, ``dataset name``, ``label``, ``bbox`` and
    ``object id``.

    Args:
        num_frames: number of frames generated per video.

    Returns:
        Path of the CSV file that was written.
    """
    # Resolve the target once instead of re-joining the path at every use.
    csv_path = os.path.join(get_tmp_dir(), "dummy.csv")

    try:
        os.remove(csv_path)
    except FileNotFoundError:
        # Nothing left over from an earlier run.
        pass

    sample_meta = {}

    index = 0
    sample_labels = ["car", "pedestrian", "bicycle"]
    num_videos = 2
    for video_id in range(num_videos):
        for frame_id in range(num_frames):
            # Four random values in [200, 500), stored as one comma-joined
            # string in the ``bbox`` column.
            random_coords = 200 + 300 * np.random.random(4)
            sample_meta[index] = {
                "id": index,
                "frame id": frame_id,
                "video id": video_id,
                "dataset name": "test_dataset",
                "label": sample_labels[np.random.choice(len(sample_labels))],
                "bbox": ",".join([str(coord) for coord in random_coords]),
                "object id": np.random.choice(3),
            }

            index += 1

    df_sample_meta = pd.DataFrame.from_dict(sample_meta, "index")
    df_sample_meta.to_csv(csv_path, index=False)
    return csv_path


# Backward-compatible alias: the helper was first published under this
# misspelled name, and existing callers still import it.
create_csv_with_comlumn_name_spaces = create_csv_with_column_name_spaces