diff --git a/CHANGES.rst b/CHANGES.rst index aa3d78d28c..060ce935b7 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -242,6 +242,12 @@ xmatch - Minor internal change to use VOTable as the response format that include units, too. [#1375] +wfau +^^^^ + +- Minor enhancement to enable getting tables of images to download instead + of just raw URLs. Tables include metadata about deprecation [#2809] + Infrastructure, Utility and Other Changes and Additions ------------------------------------------------------- diff --git a/astroquery/ukidss/tests/data/image_results.html b/astroquery/ukidss/tests/data/image_results.html deleted file mode 100644 index 138a48c8eb..0000000000 --- a/astroquery/ukidss/tests/data/image_results.html +++ /dev/null @@ -1,22 +0,0 @@ -  -

GetImage cut-out results

-
J2000 coords: RA: 83.6330757 Dec:22.014436 -
Programme: All UKIDSS surveys -
Filter: all -
Processing ... -
Connecting to database: UKIDSSDR7PLUS

- - - - - - - - - - -
LinkmultiframeIDframetypeobstypefilteridshortnamedateObsextNum
show1737581leavstackOBJECT5K2007-10-11 13:12:05.55
-1 rows returned. diff --git a/astroquery/ukidss/tests/data/image_results_noradius.html b/astroquery/ukidss/tests/data/image_results_noradius.html new file mode 100644 index 0000000000..c66ee27fb4 --- /dev/null +++ b/astroquery/ukidss/tests/data/image_results_noradius.html @@ -0,0 +1,43 @@ +  +

GetImage cut-out results

+
J2000 coords: RA: 83.633083 Dec:22.0145 +
Programme: UKIDSS Galactic Clusters Survey, GCS +
Filter: K +
Processing ... +
Connecting to database: UKIDSSDR11PLUS

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
LinkmultiframeIDframetypeobstypefilteridshortnamedateObsextNumdeprecated
show1737581leavstackOBJECT5K2007-10-11 13:12:05.550
show1737579leavOBJECT5K2007-10-11 13:12:05.550
show1737587leavOBJECT5K2007-10-11 13:12:53.850
+3 rows returned. diff --git a/astroquery/ukidss/tests/data/image_results_radius.html b/astroquery/ukidss/tests/data/image_results_radius.html new file mode 100644 index 0000000000..0a8503ad79 --- /dev/null +++ b/astroquery/ukidss/tests/data/image_results_radius.html @@ -0,0 +1,147 @@ + + + + + + + +WSA ImageList + + + + + + +
WSA ImageList   

+Not logged in: links will only be returned for frames that are publicly accessible

+Archive Listing

Searching...
+Survey: UKIDSS Galactic Clusters Survey, GCS
+Waveband: K
+Minimum RA: 5.551333333333333 hours Maximum RA: 5.599333333333333 hours
+Minimum Dec: 21.68116666666667 degrees Maximum Dec: 22.347833333333334 degrees
+ +
Using database: UKIDSSDR11PLUS +

+ + +
View column linkshows jpeg images of multiframe in a new window plus links to download file(s)
Img column linkdownload the RICE compressed FITS image file. Use View column link to retrieve uncompressed images.
Cat column linkdownload the FITS catalogue file.
+
begin row 1
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ViewImgCatmultiframeIDframeTypeobstyperaBasedecBaseshortnameexptimedateObsprojectnumDetectorsukirtRunNo
viewFITSFITS1737553leavstackOBJECT+5.5777306+21.7913333K+10.0000002007-10-11 13:08:30.0U/UKIDSS/GCS2141802
viewFITS 1737551leavOBJECT+5.5777306+21.7913333K+10.0000002007-10-11 13:08:30.0U/UKIDSS/GCS2141802
viewFITS 1737559leavOBJECT+5.5777306+21.7913333K+10.0000002007-10-11 13:09:18.6U/UKIDSS/GCS2141806
viewFITSFITS1737581leavstackOBJECT+5.5935528+21.7913333K+10.0000002007-10-11 13:12:05.5U/UKIDSS/GCS2141818
viewFITS 1737579leavOBJECT+5.5935528+21.7913333K+10.0000002007-10-11 13:12:05.5U/UKIDSS/GCS2141818
viewFITS 1737587leavOBJECT+5.5935528+21.7913333K+10.0000002007-10-11 13:12:53.8U/UKIDSS/GCS2141822

row(s) 1 to 6 displayed.
+

Back to form (uses Javascript)
+ 
diff --git a/astroquery/ukidss/tests/test_ukidss.py b/astroquery/ukidss/tests/test_ukidss.py
index d95e6268fb..d16afb154a 100644
--- a/astroquery/ukidss/tests/test_ukidss.py
+++ b/astroquery/ukidss/tests/test_ukidss.py
@@ -14,7 +14,8 @@
 from ...exceptions import InvalidQueryError
 
 DATA_FILES = {"vo_results": "vo_results.html",
-              "image_results": "image_results.html",
+              "image_results_noradius": "image_results_noradius.html",
+              "image_results_radius": "image_results_radius.html",
               "image": "image.fits",
               "votable": "votable.xml",
               "error": "error.html"
@@ -74,9 +75,12 @@ def parse_coordinates_mock_return(c):
 
 def get_mockreturn(method='GET', url='default_url',
                    params=None, timeout=10, **kwargs):
-    if "Image" in url:
-        filename = DATA_FILES["image_results"]
-        url = "Image_URL"
+    if "GetImage" in url:
+        filename = DATA_FILES["image_results_noradius"]
+        url = "GetImage"
+    elif "ImageList" in url:
+        filename = DATA_FILES["image_results_radius"]
+        url = "ImageList"
     elif "SQL" in url:
         filename = DATA_FILES["vo_results"]
         url = "SQL_URL"
@@ -114,23 +118,27 @@ def test_get_images_async_1():
 
 def test_get_images_async_2(patch_get, patch_get_readable_fileobj):
 
+    # debug check: get the table first & make sure it has 'deprecated' column as expected
+    tbl = ukidss.core.Ukidss.get_image_table(icrs_skycoord, programme_id="GPS")
+    assert "deprecated" in tbl.colnames
+
     image_urls = ukidss.core.Ukidss.get_images_async(icrs_skycoord, programme_id="GPS")
 
-    assert len(image_urls) == 1
+    assert len(image_urls) == 3
 
 
 def test_get_image_list(patch_get, patch_get_readable_fileobj):
     urls = ukidss.core.Ukidss.get_image_list(
         icrs_skycoord, frame_type="all", waveband="all", programme_id="GPS")
     print(urls)
-    assert len(urls) == 1
+    assert len(urls) == 3
 
 
 def test_extract_urls():
-    with open(data_path(DATA_FILES["image_results"]), 'r') as infile:
+    with open(data_path(DATA_FILES["image_results_radius"]), 'r') as infile:
         html_in = infile.read()
-    urls = ukidss.core.Ukidss.extract_urls(html_in)
-    assert len(urls) == 1
+    urls = ukidss.core.Ukidss._extract_urls(html_in)
+    assert len(urls) == 14
 
 
 def test_query_region(patch_get, patch_get_readable_fileobj):
diff --git a/astroquery/ukidss/tests/test_ukidss_remote.py b/astroquery/ukidss/tests/test_ukidss_remote.py
index 42966fc650..a819d9bd45 100644
--- a/astroquery/ukidss/tests/test_ukidss_remote.py
+++ b/astroquery/ukidss/tests/test_ukidss_remote.py
@@ -58,3 +58,20 @@ def test_query_region_constraints(self):
 
         assert isinstance(table_constraint, Table)
         assert len(table_noconstraint) >= len(table_constraint)
+
+    def test_deprecated_image_list(self):
+        """
+        Regression test for Issue 2808
+        """
+        crd = SkyCoord(ra=211.3194905, dec=54.413845, unit=(u.deg, u.deg))
+        uk = ukidss.core.UkidssClass()
+        uk.database = 'UHSDR2'
+        result = uk.get_image_list(crd, waveband='all', ignore_deprecated=True)
+
+        # this image is not deprecated (deprecated==0)
+        # can't check for exact URL match because URLs include generated 'uniq' strings
+        assert any("file=/disk73/wsa/ingest/fits/20190614_v5/w20190614_00626_st.fit"
+                   in x for x in result)
+        # this image is deprecated (deprecated==80)
+        assert not any("file=/disk53/wsa/ingest/fits/20150129_v5/w20150129_02901_st.fit"
+                       in x for x in result)
diff --git a/astroquery/wfau/core.py b/astroquery/wfau/core.py
index 7df464b142..9dc946810c 100644
--- a/astroquery/wfau/core.py
+++ b/astroquery/wfau/core.py
@@ -6,12 +6,17 @@
 import time
 from math import cos, radians
 import requests
-from bs4 import BeautifulSoup
+try:
+    from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
+except ImportError:
+    # workaround: older versions of bs4, which we still support, didn't have this warning
+    XMLParsedAsHTMLWarning = object
 from io import BytesIO, StringIO
 
 import astropy.units as u
 import astropy.coordinates as coord
 import astropy.io.votable as votable
+from astropy.io import ascii
 
 from ..query import QueryWithLogin
 from ..exceptions import InvalidQueryError, TimeoutError, NoResultsWarning
@@ -290,10 +295,55 @@ def get_images_async(self, coordinates, *, waveband='all', frame_type='stack',
                                       show_progress=show_progress)
                 for url in image_urls]
 
-    def get_image_list(self, coordinates, *, waveband='all', frame_type='stack',
-                       image_width=1 * u.arcmin, image_height=None,
-                       radius=None, database=None,
-                       programme_id=None, get_query_payload=False):
+    def get_image_list(self, coordinates, *, radius=None, ignore_deprecated=True,
+                       get_query_payload=False, **kwargs):
+        """
+        See `get_image_table` for a full list of options.
+
+        This method will return _only_ the URLs requested as a list of URLs.
+
+        Parameters
+        ----------
+        ignore_deprecated : bool
+            If set (default: True), only images with the ``deprecated`` flag
+            set to zero will be included; has no effect when ``radius`` is given
+
+        Returns
+        -------
+        url_list : list of image urls
+
+        """
+        image_table = self.get_image_table(coordinates, radius=radius,
+                                           get_query_payload=get_query_payload,
+                                           **kwargs)
+        if get_query_payload:
+            # actually a payload, not a table
+            return image_table
+
+        if ignore_deprecated and radius is None:
+            image_urls = image_table[image_table['deprecated'] == 0]['Link']
+        elif radius is not None:
+            image_urls = image_table['Img']
+        else:
+            image_urls = image_table['Link']
+
+        # different links for radius queries and simple ones
+        if radius is not None:
+            image_urls = [link for link in image_urls if
+                          ('fits_download' in link and '_cat.fits'
+                           not in link and '_two.fit' not in link)]
+        else:
+            # Not sure this is necessary any more (as of #2809), but it seems
+            # harmless and I'm not removing it until I'm sure
+            image_urls = [link.replace("getImage", "getFImage")
+                          for link in image_urls]
+
+        return image_urls
+
+    def get_image_table(self, coordinates, *, waveband='all', frame_type='stack',
+                        image_width=1 * u.arcmin, image_height=None,
+                        radius=None, database=None,
+                        programme_id=None, get_query_payload=False):
         """
         Function that returns a list of urls from which to download the FITS
         images.
@@ -337,7 +387,9 @@ def get_image_list(self, coordinates, *, waveband='all', frame_type='stack',
 
         Returns
         -------
-        url_list : list of image urls
+        table : Table
+            An astropy table containing the metadata, including URLs, of
+            the requested files.
 
         """
 
@@ -398,22 +450,49 @@ def get_image_list(self, coordinates, *, waveband='all', frame_type='stack',
         if get_query_payload:
             return request_payload
 
-        response = self._wfau_send_request(query_url, request_payload)
-        response = self._check_page(response.url, "row")
+        initial_response = self._wfau_send_request(query_url, request_payload)
+        self._penultimate_response = initial_response
+        response = self._check_page(initial_response.url, "row")
+        self._last_response = response
 
-        image_urls = self.extract_urls(response.text)
-        # different links for radius queries and simple ones
-        if radius is not None:
-            image_urls = [link for link in image_urls if
-                          ('fits_download' in link and '_cat.fits'
-                           not in link and '_two.fit' not in link)]
-        else:
-            image_urls = [link.replace("getImage", "getFImage")
-                          for link in image_urls]
+        return self.parse_imagequery_page(response.text, radius=radius)
 
-        return image_urls
+    def parse_imagequery_page(self, html_in, radius=None):
+        """
+        Parse the image metadata page
+        """
+        ahref = re.compile(r'href="([a-zA-Z0-9_\.&\?=%/:-]+)"')
 
-    def extract_urls(self, html_in):
+        if radius is not None:
+            html = "\n".join([
+                # for radius searches, "FITS" needs to be s/FITS/url/
+                row.replace(">FITS<", ">{}<".format(ahref.search(row).groups()[0])) if ">FITS<" in row else
+                row
+                for row in html_in.split("\n")])
+            with warnings.catch_warnings():
+                # this is really html; the xml parser doesn't work
+                warnings.simplefilter(action="ignore", category=XMLParsedAsHTMLWarning)
+                soup = BeautifulSoup(html, features='html5lib')
+            httb = soup.findAll('table')[2]
+            firstrow = httb.findAll('tr')[0]
+            for td in firstrow.findAll('td'):
+                td.name = 'th'
+            return ascii.read(str(httb), format='html')
+
+        else:
+            html = "\n".join([
+                # for ascii.read: th -> header
+                row.replace("td", "th") if row.startswith("show<", ">{}<".format(ahref.search(row).groups()[0])) if ">show<" in row else
+                row
+                for row in html_in.split("\n")])
+            with warnings.catch_warnings():
+                # ascii.read uses bs4, result is html, not xml, despite xml tag
+                warnings.simplefilter(action="ignore", category=XMLParsedAsHTMLWarning)
+                return ascii.read(html, format='html')
+
+    def _extract_urls(self, html_in):
         """
         Helper function that uses regexps to extract the image urls from the
         given HTML.
@@ -601,7 +680,7 @@ def _parse_result(self, response, *, verbose=False):
         -------
         table : `~astropy.table.Table`
         """
-        table_links = self.extract_urls(response.text)
+        table_links = self._extract_urls(response.text)
         # keep only one link that is not a webstart
         if len(table_links) == 0:
             raise Exception("No VOTable found on returned webpage!")