From 614d67698eb0c5814325491109f58ac4c2b06937 Mon Sep 17 00:00:00 2001 From: Aayush Acharya Date: Fri, 29 Dec 2023 22:34:13 +0545 Subject: [PATCH] feat: arXiv datasource addition, closes #1161 --- docs/source/reference/databases/arxiv.rst | 53 +++++++ evadb/third_party/databases/arxiv/__init__.py | 15 ++ .../databases/arxiv/arxiv_handler.py | 149 ++++++++++++++++++ .../databases/arxiv/table_column_info.py | 27 ++++ evadb/third_party/databases/interface.py | 2 + 5 files changed, 246 insertions(+) create mode 100644 docs/source/reference/databases/arxiv.rst create mode 100644 evadb/third_party/databases/arxiv/__init__.py create mode 100644 evadb/third_party/databases/arxiv/arxiv_handler.py create mode 100644 evadb/third_party/databases/arxiv/table_column_info.py diff --git a/docs/source/reference/databases/arxiv.rst b/docs/source/reference/databases/arxiv.rst new file mode 100644 index 0000000000..a949cf0c9c --- /dev/null +++ b/docs/source/reference/databases/arxiv.rst @@ -0,0 +1,53 @@ +Arxiv +========== + +The connection to Arxiv is based on the `Arxiv <https://github.com/lukasschwab/arxiv.py>`_ library. + +Dependency +---------- + +* Arxiv + + +Parameters +---------- + +Required: + +* ``query`` is the search query in the Arxiv repository. For example, Nuclear Physics. +* ``max_results`` is the max number of results to display. For example, 10. + +Create Connection +----------------- + +.. code-block:: text + + CREATE DATABASE arxiv_data WITH ENGINE = 'arxiv', PARAMETERS = { + "query": "Nuclear Physics", + "max_results": "10" + }; + +Supported Tables +---------------- + +* ``search_results``: Lists the relevant articles in the arxiv repository. Check `table_column_info.py <https://github.com/georgia-tech-db/evadb/blob/staging/evadb/third_party/databases/arxiv/table_column_info.py>`_ for all the available columns in the table. + +.. code-block:: sql + + SELECT * FROM arxiv_data.search_results; + +Here is the query output: + +.. code-block:: + + +---------------------------------------------------+-----+---------------------------------------------+ + | search_results.title | ... 
| search_results.doi | + |---------------------------------------------------|-----|---------------------------------------------| + | Nuclear Symmetry Energy Extracted from Laborat... | ... | 10.1080/10619127.2017.1388681 | + | Neutrino astrophysics and its connections to n... | ... | 10.1088/1742-6596/1056/1/012060 | + | ... | ... | ... | + +---------------------------------------------------+-----+---------------------------------------------+ + +.. note:: + + Looking for another table from Arxiv? You can add a table mapping in `arxiv_handler.py <https://github.com/georgia-tech-db/evadb/blob/staging/evadb/third_party/databases/arxiv/arxiv_handler.py>`_, or simply raise a `Feature Request <https://github.com/georgia-tech-db/evadb/issues/new/choose>`_. diff --git a/evadb/third_party/databases/arxiv/__init__.py b/evadb/third_party/databases/arxiv/__init__.py new file mode 100644 index 0000000000..7324d51f90 --- /dev/null +++ b/evadb/third_party/databases/arxiv/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""arxiv integration""" \ No newline at end of file diff --git a/evadb/third_party/databases/arxiv/arxiv_handler.py b/evadb/third_party/databases/arxiv/arxiv_handler.py new file mode 100644 index 0000000000..7670f82ea9 --- /dev/null +++ b/evadb/third_party/databases/arxiv/arxiv_handler.py @@ -0,0 +1,149 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
import arxiv
import pandas as pd

from evadb.third_party.databases.arxiv.table_column_info import ARXIV_COLUMNS
from evadb.third_party.databases.types import (
    DBHandler,
    DBHandlerResponse,
    DBHandlerStatus,
)


class ArxivHandler(DBHandler):
    """
    Read-only data source that exposes arXiv search results as a table.

    The handler runs a single, fixed search (``query`` / ``max_results``
    supplied at creation time) through the ``arxiv`` client library and
    publishes the hits as the ``search_results`` table, whose schema is
    ``ARXIV_COLUMNS``.
    """

    def __init__(self, name: str, **kwargs):
        """
        Initialize the handler.
        Args:
            name (str): name of the DB handler instance
            **kwargs: arbitrary keyword arguments for establishing the connection.
                ``query`` (str): search expression sent to arXiv, defaults to "".
                ``max_results`` (int or numeric str): result limit, defaults to 0.
        Raises:
            ValueError: if ``max_results`` cannot be converted to ``int``.
        """
        super().__init__(name, **kwargs)
        self.query = kwargs.get("query", "")
        # Parameters arrive as strings from ``CREATE DATABASE ... PARAMETERS``,
        # hence the explicit conversion.
        self.max_results = int(kwargs.get("max_results", 0))
        # Guarantee the attribute exists so the connectivity guards below never
        # raise AttributeError when they are called before ``connect()``.
        self.connection = None

    @staticmethod
    def _coerce(value, dtype):
        """
        Convert a raw ``arxiv.Result`` attribute to its declared column type.

        Only columns declared as ``str`` are touched: ``None`` and existing
        strings pass through unchanged, date-like objects become ISO-8601
        text and anything else is stringified. Columns of any other declared
        type are returned as-is.
        """
        if dtype is not str or value is None or isinstance(value, str):
            return value
        # ``published`` / ``updated`` are datetime objects in the arxiv
        # library although the schema advertises them as strings.
        if hasattr(value, "isoformat"):
            return value.isoformat()
        return str(value)

    @property
    def supported_table(self):
        """
        Mapping of table name -> {"columns": schema, "generator": row iterator}.

        A fresh (lazy) generator is created on every access; no request is
        issued until the generator is actually iterated.
        """

        def _arxiv_generator():
            search = arxiv.Search(query=self.query, max_results=self.max_results)
            for result in self.connection.results(search):
                yield {
                    column_name: self._coerce(getattr(result, column_name), dtype)
                    for column_name, dtype in ARXIV_COLUMNS
                }

        return {
            "search_results": {
                "columns": ARXIV_COLUMNS,
                "generator": _arxiv_generator(),
            },
        }

    def connect(self):
        """
        Set up the connection required by the handler.
        Returns:
            DBHandlerStatus
        """
        try:
            self.connection = arxiv.Client()
            return DBHandlerStatus(status=True)
        except Exception as e:
            return DBHandlerStatus(status=False, error=str(e))

    def disconnect(self):
        """
        Close any existing connections.

        The client is simply dropped so that ``check_connection`` and the
        other guards report the disconnected state afterwards.
        """
        self.connection = None

    def check_connection(self) -> DBHandlerStatus:
        """
        Check connection to the handler.
        Returns:
            DBHandlerStatus
        """
        if self.connection:
            return DBHandlerStatus(status=True)
        return DBHandlerStatus(status=False, error="Not connected to the database.")

    def get_tables(self) -> DBHandlerResponse:
        """
        Return the list of tables in the database.
        Returns:
            DBHandlerResponse
        """
        if not self.connection:
            # Same wording as every other guard in this handler.
            return DBHandlerResponse(data=None, error="Not connected to the database.")

        try:
            tables_df = pd.DataFrame(
                list(self.supported_table.keys()), columns=["table_name"]
            )
            return DBHandlerResponse(data=tables_df)
        except Exception as e:
            return DBHandlerResponse(data=None, error=str(e))

    def get_columns(self, table_name: str) -> DBHandlerResponse:
        """
        Returns the list of columns for the given table.
        Args:
            table_name (str): name of the table whose columns are to be retrieved.
        Returns:
            DBHandlerResponse
        """
        if not self.connection:
            return DBHandlerResponse(data=None, error="Not connected to the database.")
        try:
            tables = self.supported_table
            if table_name not in tables:
                # Explicit message instead of the bare KeyError repr.
                return DBHandlerResponse(
                    data=None,
                    error="{} is not supported or does not exist.".format(table_name),
                )
            columns_df = pd.DataFrame(
                tables[table_name]["columns"], columns=["name", "dtype"]
            )
            return DBHandlerResponse(data=columns_df)
        except Exception as e:
            return DBHandlerResponse(data=None, error=str(e))

    def select(self, table_name: str) -> DBHandlerResponse:
        """
        Returns a generator that yields the data from the given table.
        Args:
            table_name (str): name of the table whose data is to be retrieved.
        Returns:
            DBHandlerResponse
        """
        if not self.connection:
            return DBHandlerResponse(data=None, error="Not connected to the database.")
        try:
            # Resolve the mapping once; every property access builds a new
            # generator object.
            tables = self.supported_table
            if table_name not in tables:
                return DBHandlerResponse(
                    data=None,
                    error="{} is not supported or does not exist.".format(table_name),
                )
            # TODO: Projection column trimming optimization opportunity
            return DBHandlerResponse(
                data=None,
                data_generator=tables[table_name]["generator"],
            )
        except Exception as e:
            return DBHandlerResponse(data=None, error=str(e))
# Schema of the arXiv ``search_results`` table.
# Each entry is a ``[column_name, python_type]`` pair; the column name is also
# the attribute name read from every search result.
ARXIV_COLUMNS = [
    ["title", str],
    ["entry_id", str],
    ["published", str],
    ["updated", str],
    ["summary", str],
    ["authors", object],
    ["comment", str],
    ["primary_category", str],
    ["journal_ref", str],
    ["doi", str],
]

# NOTE: the same patch registers the engine in
# evadb/third_party/databases/interface.py (``_get_database_handler``):
#     elif engine == "arxiv":
#         return mod.ArxivHandler(engine, **kwargs)