From c2457b2a0a6153682d398ac07ea6f18656f8d7ba Mon Sep 17 00:00:00 2001 From: anmolagarwalcp810 <42912887+anmolagarwalcp810@users.noreply.github.com> Date: Fri, 1 Dec 2023 02:16:41 -0500 Subject: [PATCH] Enhancing INSERT Command to Support Inserting Multiple Values (#1421) # Problem Statement The INSERT command wasn't inserting multiple values into a table. It was only inserting the first tuple and ignoring everything else. ![image](https://github.com/georgia-tech-db/evadb/assets/42912887/ed8104af-cedb-4453-88ec-1e4d7827fa02) # Solution Modified the backend of the INSERT command after identifying two things: 1. The place where every tuple except the first one was dropped. That happened inside `parser/lark_visitor/_insert_statements.py`. 2. The place where the INSERT command is actually executed, `executor/insert_executor.py`, and whether it supports inserting multiple tuples (it does). After finding those two things, I simply captured all the tuples coming from the tree created by Lark, and passed them to the executor through the planner. I tried to make sure that this causes no issues inside the planner. I also modified the hash functions of the InsertTableStatement, InsertPlan and LogicalInsert classes, because the value_list member became 2-dimensional after my change and could no longer be hashed. So I converted each element of value_list to a tuple, and then the hash functions worked again. 
# Output after enhancing INSERT command ![image](https://github.com/georgia-tech-db/evadb/assets/42912887/c15cac9b-a06f-4835-9b34-b9720efaa432) --------- Co-authored-by: Anmol Agarwal Co-authored-by: americast --- evadb/executor/insert_executor.py | 9 +- evadb/optimizer/operators.py | 2 +- evadb/parser/insert_statement.py | 2 +- .../parser/lark_visitor/_insert_statements.py | 4 +- evadb/plan_nodes/insert_plan.py | 2 +- .../short/test_insert_executor.py | 106 ++++++++++++++++++ test/unit_tests/parser/test_parser.py | 6 +- 7 files changed, 119 insertions(+), 12 deletions(-) diff --git a/evadb/executor/insert_executor.py b/evadb/executor/insert_executor.py index d2dccd96a..9aac6febc 100644 --- a/evadb/executor/insert_executor.py +++ b/evadb/executor/insert_executor.py @@ -42,12 +42,13 @@ def exec(self, *args, **kwargs): table_catalog_entry.table_type == TableType.STRUCTURED_DATA ), "INSERT only implemented for structured data" - values_to_insert = [val_node.value for val_node in self.node.value_list] - tuple_to_insert = tuple(values_to_insert) + tuples_to_insert = [ + tuple(i.value for i in val_node) for val_node in self.node.value_list + ] columns_to_insert = [col_node.name for col_node in self.node.column_list] # Adding all values to Batch for insert - dataframe = pd.DataFrame([tuple_to_insert], columns=columns_to_insert) + dataframe = pd.DataFrame(tuples_to_insert, columns=columns_to_insert) batch = Batch(dataframe) storage_engine = StorageEngine.factory(self.db, table_catalog_entry) @@ -75,5 +76,5 @@ def exec(self, *args, **kwargs): execute_query_fetch_all(self.db, create_index_query) yield Batch( - pd.DataFrame([f"Number of rows loaded: {str(len(values_to_insert))}"]) + pd.DataFrame([f"Number of rows loaded: {str(len(tuples_to_insert))}"]) ) diff --git a/evadb/optimizer/operators.py b/evadb/optimizer/operators.py index 5b9bbf78d..2bf3c9bc6 100644 --- a/evadb/optimizer/operators.py +++ b/evadb/optimizer/operators.py @@ -491,7 +491,7 @@ def __hash__(self) -> int: ( 
super().__hash__(), self.table, - tuple(self.value_list), + tuple(tuple(i) for i in self.value_list), tuple(self.column_list), ) ) diff --git a/evadb/parser/insert_statement.py b/evadb/parser/insert_statement.py index 23510cbd5..17130898d 100644 --- a/evadb/parser/insert_statement.py +++ b/evadb/parser/insert_statement.py @@ -90,6 +90,6 @@ def __hash__(self) -> int: super().__hash__(), self.table_ref, tuple(self.column_list), - tuple(self.value_list), + tuple(tuple(val) for val in self.value_list), ) ) diff --git a/evadb/parser/lark_visitor/_insert_statements.py b/evadb/parser/lark_visitor/_insert_statements.py index e74c54b41..4b474f816 100644 --- a/evadb/parser/lark_visitor/_insert_statements.py +++ b/evadb/parser/lark_visitor/_insert_statements.py @@ -39,9 +39,7 @@ def insert_statement(self, tree): elif child.data == "uid_list": column_list = self.visit(child) elif child.data == "insert_statement_value": - insrt_value = self.visit(child) - # Support only (value1, value2, .... value n) - value_list = insrt_value[0] + value_list = self.visit(child) insert_stmt = InsertTableStatement(table_ref, column_list, value_list) return insert_stmt diff --git a/evadb/plan_nodes/insert_plan.py b/evadb/plan_nodes/insert_plan.py index 4b329aa8b..83308a08d 100644 --- a/evadb/plan_nodes/insert_plan.py +++ b/evadb/plan_nodes/insert_plan.py @@ -55,6 +55,6 @@ def __hash__(self) -> int: super().__hash__(), self.table_ref, tuple(self.column_list), - tuple(self.value_list), + tuple(tuple(val) for val in self.value_list), ) ) diff --git a/test/integration_tests/short/test_insert_executor.py b/test/integration_tests/short/test_insert_executor.py index 28e520e6f..c22bf8707 100644 --- a/test/integration_tests/short/test_insert_executor.py +++ b/test/integration_tests/short/test_insert_executor.py @@ -43,10 +43,21 @@ def setUp(self): """ execute_query_fetch_all(self.evadb, query) + query = """CREATE TABLE IF NOT EXISTS books + ( + name TEXT(100), + author TEXT(100), + year INTEGER + ); + """ 
+ execute_query_fetch_all(self.evadb, query) + def tearDown(self): shutdown_ray() file_remove("dummy.avi") + execute_query_fetch_all(self.evadb, "DROP TABLE IF EXISTS books;") + # integration test @unittest.skip("Not supported in current version") def test_should_load_video_in_table(self): @@ -111,3 +122,98 @@ def test_should_insert_tuples_in_table(self): query = """SELECT name FROM CSVTable WHERE name LIKE '.*(sad|happy)';""" batch = execute_query_fetch_all(self.evadb, query) self.assertEqual(len(batch._frames), 2) + + def test_insert_one_tuple_in_table(self): + query = """ + INSERT INTO books (name, author, year) VALUES ( + 'Harry Potter', 'JK Rowling', 1997 + ); + """ + execute_query_fetch_all(self.evadb, query) + query = "SELECT * FROM books;" + batch = execute_query_fetch_all(self.evadb, query) + logger.info(batch) + + self.assertIsNone( + np.testing.assert_array_equal( + batch.frames["books.name"].array, + np.array( + [ + "Harry Potter", + ] + ), + ) + ) + + self.assertIsNone( + np.testing.assert_array_equal( + batch.frames["books.author"].array, + np.array( + [ + "JK Rowling", + ] + ), + ) + ) + + self.assertIsNone( + np.testing.assert_array_equal( + batch.frames["books.year"].array, + np.array( + [ + 1997, + ] + ), + ) + ) + + def test_insert_multiple_tuples_in_table(self): + query = """ + INSERT INTO books (name, author, year) VALUES + ('Fantastic Beasts Collection', 'JK Rowling', 2001), + ('Magic Tree House Collection', 'Mary Pope Osborne', 1992), + ('Sherlock Holmes', 'Arthur Conan Doyle', 1887); + """ + execute_query_fetch_all(self.evadb, query) + query = "SELECT * FROM books;" + batch = execute_query_fetch_all(self.evadb, query) + logger.info(batch) + + self.assertIsNone( + np.testing.assert_array_equal( + batch.frames["books.name"].array, + np.array( + [ + "Fantastic Beasts Collection", + "Magic Tree House Collection", + "Sherlock Holmes", + ] + ), + ) + ) + + self.assertIsNone( + np.testing.assert_array_equal( + batch.frames["books.author"].array, + 
np.array( + [ + "JK Rowling", + "Mary Pope Osborne", + "Arthur Conan Doyle", + ] + ), + ) + ) + + self.assertIsNone( + np.testing.assert_array_equal( + batch.frames["books.year"].array, + np.array( + [ + 2001, + 1992, + 1887, + ] + ), + ) + ) diff --git a/test/unit_tests/parser/test_parser.py b/test/unit_tests/parser/test_parser.py index 3091e8f3d..be32f3836 100644 --- a/test/unit_tests/parser/test_parser.py +++ b/test/unit_tests/parser/test_parser.py @@ -827,8 +827,10 @@ def test_insert_statement(self): TupleValueExpression("Frame_Path"), ], [ - ConstantValueExpression(1), - ConstantValueExpression("/mnt/frames/1.png", ColumnType.TEXT), + [ + ConstantValueExpression(1), + ConstantValueExpression("/mnt/frames/1.png", ColumnType.TEXT), + ] ], ) evadb_statement_list = parser.parse(insert_query)