Merge pull request #3014 from esdc-esac-esa-int/ESA_gaia_GAIAMNGT-170…

…0_load_data Gaia: change the signature of the method load_data
astropy · Oct 16, 2024 · aa35035 · aa35035
2 parents 22db7f2 + 58f8692
commit aa35035
Show file tree

Hide file tree

Showing 10 changed files with 538 additions and 54 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -181,8 +181,13 @@ gaia
 
 - Fix method search_async_jobs in the class TapPlus. [#2967]
 
+- Change the signature of the function load_data: the parameter output_file, which defined the file where the results
+  were saved, is replaced by the boolean parameter dump_to_file. When it is True, a compressed file named
+  "datalink_output_<time_stamp>.zip" containing all the DataLink files is created in the current working directory.
+  Users can therefore no longer specify the output file. [#3014]
+
 - New retrieval types for datalink (Gaia DR4 release). [#3110]
 
+
 jplhorizons
 ^^^^^^^^^^^
 
@@ -423,6 +428,8 @@ gaia
   epoch photometry service to return all data associated to a given source.
   [#2376]
 
+- New retrieval types for datalink (Gaia DR4 release). [#3110]
+
 - Default Gaia catalog updated to DR3. [#2596]
 
 heasarc

diff --git a/astroquery/gaia/core.py b/astroquery/gaia/core.py
@@ -13,13 +13,12 @@
 Created on 30 jun. 2016
 Modified on 18 Ene. 2022 by mhsarmiento
 """
+import datetime
 import json
 import os
 import shutil
 import zipfile
 from collections.abc import Iterable
-from datetime import datetime, timezone
-from pathlib import Path
 
 from astropy import units
 from astropy import units as u
@@ -28,6 +27,7 @@
 from astropy.io import votable
 from astropy.table import Table
 from astropy.units import Quantity
+from astropy.utils.decorators import deprecated_renamed_argument
 from requests import HTTPError
 
 from astroquery import log
@@ -168,9 +168,11 @@ def logout(self, *, verbose=False):
         except HTTPError:
             log.error("Error logging out data server")
 
+    @deprecated_renamed_argument("output_file", None, since="0.4.8")
     def load_data(self, ids, *, data_release=None, data_structure='INDIVIDUAL', retrieval_type="ALL",
                   linking_parameter='SOURCE_ID', valid_data=False, band=None, avoid_datatype_check=False,
-                  format="votable", output_file=None, overwrite_output_file=False, verbose=False):
+                  format="votable", dump_to_file=False, overwrite_output_file=False, verbose=False,
+                  output_file=None):
         """Loads the specified table
         TAP+ only
 
@@ -218,44 +220,53 @@ def load_data(self, ids, *, data_release=None, data_structure='INDIVIDUAL', retr
             By default, this value will be set to False. If it is set to 'true'
             the Datalink items tags will not be checked.
         format : str, optional, default 'votable'
-            loading format. Other available formats are 'csv', 'ecsv','votable_plain' and 'fits'
-        output_file : string or pathlib.PosixPath, optional, default None
-            file where the results are saved.
-            If it is not provided, the http response contents are returned.
+            loading format. Other available formats are 'csv', 'ecsv','votable_plain', 'json' and 'fits'
+        dump_to_file : boolean, optional, default False
+            If it is true, a compressed file named "datalink_output_<time_stamp>.zip" containing all the DataLink
+            files is created in the current working directory. The <time_stamp> format follows the ISO 8601
+            standard: "yyyymmddThhmmss".
         overwrite_output_file : boolean, optional, default False
-            To overwrite the output_file if it already exists.
+            To overwrite the output file ("datalink_output_<time_stamp>.zip") if it already exists.
         verbose : bool, optional, default 'False'
             flag to display information about the process
 
         Returns
         -------
         A dictionary where the keys are the file names and its value is a list of astropy.table.table.Table objects
         """
-        now = datetime.now(timezone.utc)
-        now_formatted = now.strftime("%Y%m%d_%H%M%S")
-        temp_dirname = "temp_" + now_formatted
-        downloadname_formated = "download_" + now_formatted
 
         output_file_specified = False
-        if output_file is None:
+
+        now = datetime.datetime.now(datetime.timezone.utc)
+        if not dump_to_file:
+            now_formatted = now.strftime("%Y%m%d_%H%M%S")
+            temp_dirname = "temp_" + now_formatted
+            downloadname_formated = "download_" + now_formatted
             output_file = os.path.join(os.getcwd(), temp_dirname, downloadname_formated)
+
         else:
+            output_file = 'datalink_output_' + now.strftime("%Y%m%dT%H%M%S") + '.zip'
             output_file_specified = True
-
-            if isinstance(output_file, str):
-                if not output_file.lower().endswith('.zip'):
-                    output_file = output_file + '.zip'
-            elif isinstance(output_file, Path):
-                if not output_file.suffix.endswith('.zip'):
-                    output_file.with_suffix('.zip')
-
             output_file = os.path.abspath(output_file)
+            log.info(f"DataLink products will be stored in the {output_file} file")
+
             if not overwrite_output_file and os.path.exists(output_file):
                 raise ValueError(f"{output_file} file already exists. Please use overwrite_output_file='True' to "
                                  f"overwrite output file.")
 
         path = os.path.dirname(output_file)
 
+        log.debug(f"Directory where the data will be saved: {path}")
+
+        if path != '':
+            if not os.path.isdir(path):
+                try:
+                    os.mkdir(path)
+                except FileExistsError:
+                    log.warn("Path %s already exist" % path)
+                except OSError:
+                    log.error("Creation of the directory %s failed" % path)
+
         if avoid_datatype_check is False:
             # we need to check params
             rt = str(retrieval_type).upper()
@@ -298,14 +309,7 @@ def load_data(self, ids, *, data_release=None, data_structure='INDIVIDUAL', retr
             if linking_parameter != 'SOURCE_ID':
                 params_dict['LINKING_PARAMETER'] = linking_parameter
 
-        if path != '':
-            try:
-                os.mkdir(path)
-            except FileExistsError:
-                log.error("Path %s already exist" % path)
-            except OSError:
-                log.error("Creation of the directory %s failed" % path)
-
+        files = dict()
         try:
             self.__gaiadata.load_data(params_dict=params_dict, output_file=output_file, verbose=verbose)
             files = Gaia.__get_data_files(output_file=output_file, path=path)
@@ -314,6 +318,9 @@ def load_data(self, ids, *, data_release=None, data_structure='INDIVIDUAL', retr
         finally:
             if not output_file_specified:
                 shutil.rmtree(path)
+            else:
+                for file in files.keys():
+                    os.remove(os.path.join(os.getcwd(), path, file))
 
         if verbose:
             if output_file_specified:
@@ -329,18 +336,21 @@ def load_data(self, ids, *, data_release=None, data_structure='INDIVIDUAL', retr
     @staticmethod
     def __get_data_files(output_file, path):
         files = {}
-        if zipfile.is_zipfile(output_file):
-            with zipfile.ZipFile(output_file, 'r') as zip_ref:
-                zip_ref.extractall(os.path.dirname(output_file))
+        extracted_files = []
+
+        with zipfile.ZipFile(output_file, "r") as zip_ref:
+            extracted_files.extend(zip_ref.namelist())
+            zip_ref.extractall(os.path.dirname(output_file))
 
         # r=root, d=directories, f = files
         for r, d, f in os.walk(path):
             for file in f:
-                if file.lower().endswith(('.fits', '.xml', '.csv', '.ecsv')):
+                if file in extracted_files:
                     files[file] = os.path.join(r, file)
 
         for key, value in files.items():
-            if '.fits' in key:
+
+            if key.endswith('.fits'):
                 tables = []
                 with fits.open(value) as hduList:
                     num_hdus = len(hduList)
@@ -349,19 +359,20 @@ def __get_data_files(output_file, path):
                         Gaia.correct_table_units(table)
                         tables.append(table)
                     files[key] = tables
-            elif '.xml' in key:
+
+            elif key.endswith('.xml'):
                 tables = []
                 for table in votable.parse(value).iter_tables():
                     tables.append(table)
                 files[key] = tables
 
-            elif '.csv' in key:
+            elif key.endswith('.csv'):
                 tables = []
                 table = Table.read(value, format='ascii.csv', fast_reader=False)
                 tables.append(table)
                 files[key] = tables
 
-            elif '.json' in key:
+            elif key.endswith('.json'):
                 tables = []
                 with open(value) as f:
                     data = json.load(f)

diff --git a/astroquery/gaia/tests/data/gaia_dr3_source_id_5937083312263887616_dl_products_csv.zip b/astroquery/gaia/tests/data/gaia_dr3_source_id_5937083312263887616_dl_products_csv.zip
diff --git a/astroquery/gaia/tests/data/gaia_dr3_source_id_5937083312263887616_dl_products_ecsv.zip b/astroquery/gaia/tests/data/gaia_dr3_source_id_5937083312263887616_dl_products_ecsv.zip
diff --git a/astroquery/gaia/tests/data/gaia_dr3_source_id_5937083312263887616_dl_products_fits.zip b/astroquery/gaia/tests/data/gaia_dr3_source_id_5937083312263887616_dl_products_fits.zip
diff --git a/astroquery/gaia/tests/data/gaia_dr3_source_id_5937083312263887616_dl_products_vot.zip b/astroquery/gaia/tests/data/gaia_dr3_source_id_5937083312263887616_dl_products_vot.zip
diff --git a/astroquery/gaia/tests/setup_package.py b/astroquery/gaia/tests/setup_package.py
@@ -10,7 +10,8 @@ def get_package_data():
     paths = [os.path.join('data', '*.vot'),
              os.path.join('data', '*.vot.gz'),
              os.path.join('data', '*.json'),
-             os.path.join('data', '*.ecsv')
+             os.path.join('data', '*.ecsv'),
+             os.path.join('data', '*.zip')
              ]  # etc, add other extensions
     # you can also enlist files individually by names
     # finally construct and return a dict for the sub module