feat: arXiv datasource addition, closes #1161 #1439

Open · wants to merge 4 commits into base: staging
15 changes: 8 additions & 7 deletions docs/_toc.yml
Original file line number Diff line number Diff line change
@@ -39,7 +39,7 @@ parts:

- caption: User Reference
chapters:
- file: source/reference/evaql
- file: source/reference/evaql
title: Query Language
sections:
- file: source/reference/evaql/load_csv
@@ -65,25 +65,26 @@ parts:

- file: source/reference/api
title: Python API

- file: source/reference/rest_api
title: REST API

- file: source/reference/databases/index
title: Data Sources
sections:
sections:
- file: source/reference/databases/postgres
- file: source/reference/databases/sqlite
- file: source/reference/databases/mysql
- file: source/reference/databases/mariadb
- file: source/reference/databases/clickhouse
- file: source/reference/databases/github
- file: source/reference/databases/arxiv
- file: source/reference/databases/snowflake
- file: source/reference/databases/hackernews

- file: source/reference/vector_databases/index
title: Vector Databases
sections:
sections:
- file: source/reference/vector_databases/faiss
- file: source/reference/vector_databases/chromadb
- file: source/reference/vector_databases/qdrant
@@ -106,9 +107,9 @@ parts:
- file: source/reference/ai/hf
title: Hugging Face
- file: source/reference/ai/openai
title: OpenAI
title: OpenAI
- file: source/reference/ai/yolo
title: YOLO
title: YOLO
- file: source/reference/ai/stablediffusion
title: Stable Diffusion

@@ -117,7 +118,7 @@ parts:

- file: source/reference/optimizations
title: Optimizations

# - file: source/reference/io
# title: IO Descriptors

53 changes: 53 additions & 0 deletions docs/source/reference/databases/arxiv.rst
@@ -0,0 +1,53 @@
arXiv
=====

The connection to arXiv is based on the `arxiv.py <https://github.com/lukasschwab/arxiv.py>`_ library.

Dependency
----------

* arxiv (the ``arxiv`` Python package)


Parameters
----------

Required:

* ``query`` is the search query to run against the arXiv repository, e.g., ``Nuclear Physics``.
* ``max_results`` is the maximum number of results to fetch, e.g., ``10``.

Create Connection
-----------------

.. code-block:: sql

CREATE DATABASE arxiv_data WITH ENGINE = 'arxiv', PARAMETERS = {
"query": "Nuclear Physics",
"max_results": "10"
};

Supported Tables
----------------

* ``search_results``: Lists the relevant articles in the arXiv repository. Check `table_column_info.py <https://github.com/georgia-tech-db/evadb/blob/staging/evadb/third_party/databases/arxiv/table_column_info.py>`_ for all the available columns in the table.

.. code-block:: sql

SELECT * FROM arxiv_data.search_results;

Here is the query output:

.. code-block::

+---------------------------------------------------+-----+---------------------------------------------+
| search_results.title | ... | search_results.doi |
|---------------------------------------------------|-----|---------------------------------------------|
| Nuclear Symmetry Energy Extracted from Laborat... | ... | 10.1080/10619127.2017.1388681 |
| Neutrino astrophysics and its connections to n... | ... | 10.1088/1742-6596/1056/1/012060 |
| ... | ... | ... |
+---------------------------------------------------+-----+---------------------------------------------+

.. note::

Looking for another table from arXiv? You can add a table mapping in `arxiv_handler.py <https://github.com/georgia-tech-db/evadb/blob/staging/evadb/third_party/databases/arxiv/arxiv_handler.py>`_, or simply raise a `Feature Request <https://github.com/georgia-tech-db/evadb/issues/new/choose>`_.
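The ``PARAMETERS`` dictionary shown above maps one-to-one onto the ``CREATE DATABASE`` statement. A minimal sketch of rendering that statement from a plain Python dict (the ``build_create_database`` helper is hypothetical, not part of EvaDB):

```python
# Hypothetical helper: renders a CREATE DATABASE statement for a data-source
# engine from a parameters dict, mirroring the example above.
def build_create_database(name: str, engine: str, params: dict) -> str:
    rendered = ", ".join(f'"{k}": "{v}"' for k, v in params.items())
    return (
        f"CREATE DATABASE {name} WITH ENGINE = '{engine}', "
        f"PARAMETERS = {{{rendered}}};"
    )


query = build_create_database(
    "arxiv_data", "arxiv", {"query": "Nuclear Physics", "max_results": "10"}
)
```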
15 changes: 15 additions & 0 deletions evadb/third_party/databases/arxiv/__init__.py
@@ -0,0 +1,15 @@
# coding=utf-8
# Copyright 2018-2023 EvaDB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""arxiv integration"""
143 changes: 143 additions & 0 deletions evadb/third_party/databases/arxiv/arxiv_handler.py
@@ -0,0 +1,143 @@
# coding=utf-8
# Copyright 2018-2023 EvaDB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import arxiv
import pandas as pd

from evadb.third_party.databases.arxiv.table_column_info import ARXIV_COLUMNS
from evadb.third_party.databases.types import (
DBHandler,
DBHandlerResponse,
DBHandlerStatus,
)


class ArxivHandler(DBHandler):
def __init__(self, name: str, **kwargs):
"""
Initialize the handler.
Args:
name (str): name of the DB handler instance
**kwargs: arbitrary keyword arguments for establishing the connection.
"""
        super().__init__(name, **kwargs)
        # Initialize to None so check_connection() is safe before connect().
        self.connection = None
        self.query = kwargs.get("query", "")
        self.max_results = int(kwargs.get("max_results", 0))

@property
def supported_table(self):
        def _arxiv_generator():
            # Lazily fetch results from the arXiv API and project each one
            # onto the declared columns.
            for result in self.connection.results(
                arxiv.Search(query=self.query, max_results=self.max_results)
            ):
                yield {
                    property_name: getattr(result, property_name)
                    for property_name, _ in ARXIV_COLUMNS
                }

mapping = {
"search_results": {
"columns": ARXIV_COLUMNS,
"generator": _arxiv_generator(),
},
}
return mapping

def connect(self):
"""
Set up the connection required by the handler.
Returns:
DBHandlerStatus
"""
try:
self.connection = arxiv.Client()
return DBHandlerStatus(status=True)
except Exception as e:
return DBHandlerStatus(status=False, error=str(e))

def disconnect(self):
"""
Close any existing connections.
"""
pass

def check_connection(self) -> DBHandlerStatus:
"""
Check connection to the handler.
Returns:
DBHandlerStatus
"""
if self.connection:
return DBHandlerStatus(status=True)
else:
return DBHandlerStatus(status=False, error="Not connected to the database.")

def get_tables(self) -> DBHandlerResponse:
"""
Return the list of tables in the database.
Returns:
DBHandlerResponse
"""
if not self.connection:
            return DBHandlerResponse(data=None, error="Not connected to the database.")

try:
tables_df = pd.DataFrame(
list(self.supported_table.keys()), columns=["table_name"]
)
return DBHandlerResponse(data=tables_df)
except Exception as e:
return DBHandlerResponse(data=None, error=str(e))

def get_columns(self, table_name: str) -> DBHandlerResponse:
"""
Returns the list of columns for the given table.
Args:
table_name (str): name of the table whose columns are to be retrieved.
Returns:
DBHandlerResponse
"""
if not self.connection:
return DBHandlerResponse(data=None, error="Not connected to the database.")
try:
columns_df = pd.DataFrame(
self.supported_table[table_name]["columns"], columns=["name", "dtype"]
)
return DBHandlerResponse(data=columns_df)
except Exception as e:
return DBHandlerResponse(data=None, error=str(e))

def select(self, table_name: str) -> DBHandlerResponse:
"""
Returns a generator that yields the data from the given table.
Args:
table_name (str): name of the table whose data is to be retrieved.
Returns:
DBHandlerResponse
"""
if not self.connection:
return DBHandlerResponse(data=None, error="Not connected to the database.")
try:
if table_name not in self.supported_table:
return DBHandlerResponse(
data=None,
error="{} is not supported or does not exist.".format(table_name),
)
# TODO: Projection column trimming optimization opportunity
return DBHandlerResponse(
data=None,
data_generator=self.supported_table[table_name]["generator"],
)
except Exception as e:
return DBHandlerResponse(data=None, error=str(e))
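The handler exposes each table through a lazily evaluated generator keyed by table name. The same pattern can be sketched in isolation with a stubbed row source (the ``Row`` objects below are illustrative stand-ins for live arXiv results, not real API responses):

```python
# Sketch of the generator-backed table mapping used by ArxivHandler,
# with a stubbed result source instead of a live arxiv.Client.
COLUMNS = [["title", str], ["doi", str]]  # abbreviated column spec


def fake_results():
    # Stand-in for arxiv.Client().results(...); yields attribute-bearing rows.
    class Row:
        def __init__(self, title, doi):
            self.title, self.doi = title, doi

    yield Row("Nuclear Symmetry Energy ...", "10.1080/10619127.2017.1388681")
    yield Row("Neutrino astrophysics ...", "10.1088/1742-6596/1056/1/012060")


def table_generator():
    # Project each result object onto the declared columns, as dicts.
    for row in fake_results():
        yield {name: getattr(row, name) for name, _ in COLUMNS}


mapping = {"search_results": {"columns": COLUMNS, "generator": table_generator()}}
rows = list(mapping["search_results"]["generator"])
```

Because the generator is only consumed when `select()` hands it to the executor, no network request is made until the query actually runs.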
27 changes: 27 additions & 0 deletions evadb/third_party/databases/arxiv/table_column_info.py
@@ -0,0 +1,27 @@
# coding=utf-8
# Copyright 2018-2023 EvaDB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

ARXIV_COLUMNS = [
["title", str],
["entry_id", str],
["published", str],
["updated", str],
["summary", str],
["authors", object],
["comment", str],
["primary_category", str],
["journal_ref", str],
["doi", str],
]
2 changes: 2 additions & 0 deletions evadb/third_party/databases/interface.py
@@ -48,6 +48,8 @@ def _get_database_handler(engine: str, **kwargs):
return mod.SnowFlakeDbHandler(engine, **kwargs)
elif engine == "github":
return mod.GithubHandler(engine, **kwargs)
elif engine == "arxiv":
return mod.ArxivHandler(engine, **kwargs)
elif engine == "hackernews":
return mod.HackernewsSearchHandler(engine, **kwargs)
elif engine == "slack":
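The ``elif`` chain in ``_get_database_handler`` grows with every new data source; a table-driven dispatch is one alternative. A minimal sketch under stubbed handler classes (these stand in for the real ``mod.*Handler`` classes, which are not reproduced here):

```python
# Sketch of dict-based handler dispatch as an alternative to the elif chain.
class ArxivHandler:  # stub standing in for mod.ArxivHandler
    def __init__(self, engine, **kwargs):
        self.engine, self.kwargs = engine, kwargs


class GithubHandler:  # stub standing in for mod.GithubHandler
    def __init__(self, engine, **kwargs):
        self.engine, self.kwargs = engine, kwargs


# Registry maps engine names to handler classes; adding a data source
# becomes a one-line change instead of another elif branch.
HANDLERS = {"arxiv": ArxivHandler, "github": GithubHandler}


def get_database_handler(engine: str, **kwargs):
    try:
        return HANDLERS[engine](engine, **kwargs)
    except KeyError:
        raise NotImplementedError(f"Engine {engine} is not supported")


handler = get_database_handler("arxiv", query="Nuclear Physics", max_results="10")
```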
59 changes: 59 additions & 0 deletions test/integration_tests/long/test_arxiv_datasource.py
@@ -0,0 +1,59 @@
# coding=utf-8
# Copyright 2018-2023 EvaDB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from test.util import get_evadb_for_testing

import pytest

from evadb.server.command_handler import execute_query_fetch_all
from evadb.third_party.databases.arxiv.table_column_info import ARXIV_COLUMNS


@pytest.mark.notparallel
class ArxivDataSourceTest(unittest.TestCase):
def setUp(self):
self.evadb = get_evadb_for_testing()
# reset the catalog manager before running each test
self.evadb.catalog().reset()

def tearDown(self):
execute_query_fetch_all(self.evadb, "DROP DATABASE IF EXISTS arxiv_data;")

@pytest.mark.skip(
reason="Need https://github.com/georgia-tech-db/evadb/pull/1280 for a cost-based rebatch optimization"
)
@pytest.mark.xfail(reason="Flaky testcase due to `bad request` error message")
def test_should_run_select_query_in_arxiv(self):
# Create database.
params = {
"query": "Nuclear Physics",
"max_results": "10",
}
query = f"""CREATE DATABASE arxiv_data
WITH ENGINE = "arxiv",
PARAMETERS = {params};"""
execute_query_fetch_all(self.evadb, query)

query = "SELECT * FROM arxiv_data.search_results LIMIT 10;"
batch = execute_query_fetch_all(self.evadb, query)
self.assertEqual(len(batch), 10)
        expected_columns = [
            "search_results.{}".format(col) for col, _ in ARXIV_COLUMNS
        ]
        self.assertEqual(batch.columns, expected_columns)


if __name__ == "__main__":
unittest.main()
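The column assertion in the test qualifies each column name with the table name; reproduced standalone, using the same column list as ``table_column_info.py``:

```python
# Derive the fully-qualified column names the test compares against.
ARXIV_COLUMNS = [
    ["title", str], ["entry_id", str], ["published", str], ["updated", str],
    ["summary", str], ["authors", object], ["comment", str],
    ["primary_category", str], ["journal_ref", str], ["doi", str],
]

expected_columns = ["search_results.{}".format(col) for col, _ in ARXIV_COLUMNS]
```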