separate rel_path from rel_url_path, use one url

conbench · jonkeane · Oct 25, 2022 · Oct 20, 2022 · Oct 25, 2022 · Oct 25, 2022
commit 665f4cbf7f8c30f0b8e04fd7104522db06f484da
diff --git a/README.rst b/README.rst
@@ -263,17 +263,23 @@ datalogistik_metadata.ini
         Schema of the table.
 
     ``url``
-        Download url in case this is a single-file table.
-
-    ``base_url``
-        Base download url in case this is a multi-file table. Each file will append
-        their `rel_path` to this to form the full download url.
+        Download url for the table. This can be: 
+        * A URL specifying the file to be downloaded for that table (which could be a 
+          single file, or a directory that contains many files to be downloaded)
+        * A base URL that is concatenated with each ``rel_url_path`` in the ``files`` attribute
+          if the table is a multi-file table and it is preferable to list out the files.
 
     ``files``
         A list of files in this table. Each entry in the list has the following properties:
 
         ``rel_path``
-            Path to the file, relative to the directory of this table.
+            Path to the file(s), relative to the directory of this table. This is the
+            location on disk in the cache.
+
+        ``rel_url_path``
+            URL path to the file(s), relative to the directory of this table. This is used
+            only when downloading the file. This is only necessary when a multi-file table has
+            the files that make up the table listed out individually.
 
         ``file_size``
             Size of the file.

diff --git a/datalogistik/dataset.py b/datalogistik/dataset.py
@@ -357,7 +357,6 @@ def validate_table_files(self, table):
                 f"No metadata found for table {table}, could not perform validation (assuming valid)"
             )
             return True
-
         new_file_listing = self.create_file_listing(table)
         # we can't perform a simple equality check on the whole listing,
         # because the orig_file_listing does not contain the metadata file.
@@ -372,6 +371,8 @@ def validate_table_files(self, table):
             found = None
             for new_file in new_file_listing:
                 if new_file["rel_path"] == orig_file["rel_path"]:
+                    # drop the rel_url_path for comparison, because it's not relevant!
+                    orig_file.pop("rel_url_path", None)
                     found = new_file
                     break
 
@@ -448,7 +449,7 @@ def download(self):
             msg = (
                 "No table entries were found. "
                 "To download a dataset, at least 1 table entry must exist "
-                "that has a 'url' or 'base_url' property."
+                "that has a 'url' property."
             )
             log.error(msg)
             raise ValueError(msg)
@@ -460,51 +461,48 @@ def download(self):
 
             # For now, we always download all tables. So we need to loop through each table
             for table in self.tables:
-
-                # There are 2 possible types downloads:
-                # 1 - The table entry has a url property. Either this table is a single-file, or it is a single
-                # download that will produce multiple files.
-                # 2 - multi file (either single or multi table). The table entry has a base_url property,
-                # and each file has a rel_path property. This is appended to the base_url to form
-                # the download link. The files will be placed in the table directory (generated from the table name).
-
                 # create table dir
                 table_path = self.ensure_table_loc(table)
 
-                # Type 1
-                if table.url:
-                    # Note that table_path will be a file if this is a single-file table,
-                    # and a dir if it is a multi-file table (download_file will produce multiple files)
-                    util.download_file(table.url, output_path=table_path)
-                    util.set_readonly(table_path)
-
-                # Type 2
-                elif table.base_url:
-                    if len(table.files) <= 1:
-                        msg = f"Single-file table '{table.table}' has 'base_url' property set. It should only have a 'url'."
+                for file in table.files:
+                    # Validate that there is a url at all
+                    if not table.url:
+                        msg = (
+                            f"Could not find a url property for Table '{table.table}'."
+                        )
                         log.error(msg)
-                        raise ValueError(msg)
-                    for file in table.files:
-                        # contains the suffix for the download url
-                        rel_path = file.get("rel_path")
-                        if not rel_path:
-                            msg = f"Missing rel_path property for multi-file table '{table.table}'."
+                        raise RuntimeError(msg)
+
+                    # contains the suffix for the download url
+                    rel_url_path = file.get("rel_url_path")
+
+                    # if rel_url_path is not already at the end of the table url, join them
+                    table_url = table.url
+                    if rel_url_path and not table_url.endswith(rel_url_path):
+                        table_url = table_url + "/" + rel_url_path
+
+                    download_path = table_path
+
+                    # Set the rel_path
+                    file["rel_path"] = download_path.name
+
+                    # if this is a multi file table, then we need to validate that
+                    # there are rel_url_paths + append them. All files constituting a table
+                    # must be in a dir with name table.name (created by ensure_table_loc)
+                    # note that the resulting dir structure is not necessarily flat,
+                    # because the table can have multiple levels of partitioning.
+                    if len(table.files) > 1 or table.multi_file:
+                        if not rel_url_path:
+                            msg = f"Missing rel_url_path property for multi-file table '{table.table}'."
                             log.error(msg)
                             raise ValueError(msg)
+                        download_path = download_path / rel_url_path
 
-                        # All files constituting a table must be in a dir with name table.name (created by ensure_table_loc)
-                        # note that the resulting dir structure is not necessarily flat,
-                        # because the table can have multiple levels of partitioning.
-                        download_path = table_path / rel_path
-                        url = table.base_url + "/" + rel_path
-
-                        util.download_file(url, output_path=download_path)
-                        util.set_readonly(download_path)
+                        # but for multi-file tables, we override this with the rel_url_path
+                        file["rel_path"] = rel_url_path
 
-                else:
-                    msg = f"Could not find a url or base_url property for Table '{table.table}'."
-                    log.error(msg)
-                    raise RuntimeError(msg)
+                    util.download_file(table_url, output_path=download_path)
+                    util.set_readonly(download_path)
 
                 # Try validation in case the dataset info contained checksums
                 if not self.validate_table_files(table):

diff --git a/datalogistik/table.py b/datalogistik/table.py
@@ -33,4 +33,3 @@ class Table:
     header_line: Optional[bool] = None
     dim: Optional[List] = field(default_factory=list)
     url: Optional[str] = None
-    base_url: Optional[str] = None
diff --git a/repo.json b/repo.json
@@ -12,7 +12,6 @@
                 "header_line": false,
                 "files": [
                     {
-                        "rel_path": "2016Q4.csv.gz",
                         "file_size": 262125134,
                         "md5": "6e8ff7ae76033c452a9b72350461a4e6"
                     }
@@ -77,7 +76,6 @@
                 ],
                 "files": [
                     {
-                        "rel_path": "nyctaxi_2010-01.csv.gz",
                         "file_size": 591876633,
                         "md5": "22bd86bf54da91faf26fa8a243644a9f"
                     }
@@ -98,7 +96,6 @@
                 ],
                 "files": [
                     {
-                        "rel_path": "chi_traffic_2020_Q1.parquet",
                         "file_size": 182895135,
                         "md5": "b53ea738eda7ee051bcf33b3056b83f6"
                     }
@@ -119,7 +116,6 @@
                 ],
                 "files": [
                     {
-                        "rel_path": "type_strings.parquet",
                         "file_size": 87174822,
                         "md5": "927c0cf481fb3c4870806b096b525d90"
                     }
@@ -140,7 +136,6 @@
                 ],
                 "files": [
                     {
-                        "rel_path": "type_dict.parquet",
                         "file_size": 2890770,
                         "md5": "4186ad2f3072db6558aba3c535da1b97"
                     }
@@ -161,7 +156,6 @@
                 ],
                 "files": [
                     {
-                        "rel_path": "type_integers.parquet",
                         "file_size": 15882666,
                         "md5": "7d44caa83025c9c7ef2ded67081d6d68"
                     }
@@ -182,7 +176,6 @@
                 ],
                 "files": [
                     {
-                        "rel_path": "type_floats.parquet",
                         "file_size": 23851672,
                         "md5": "d083e46ddeff5cc5a3be3ea5c2f7f9d7"
                     }
@@ -203,7 +196,6 @@
                 ],
                 "files": [
                     {
-                        "rel_path": "type_nested.parquet",
                         "file_size": 130538033,
                         "md5": "4bcf6735b00b32fccb6da338262def23"
                     }
@@ -224,12 +216,11 @@
                 ],
                 "files": [
                     {
-                        "rel_path": "type_simple_features.parquet",
                         "file_size": 28637722,
                         "md5": "7776bea2a08e0466f77058676e2bb567"
                     }
                 ]
             }
         ]
     }
-]
+]
diff --git a/tests/fixtures/test_cache/taxi_2013/face7ed/datalogistik_metadata.ini b/tests/fixtures/test_cache/taxi_2013/face7ed/datalogistik_metadata.ini
@@ -7,7 +7,7 @@
     "tables": [
         {
             "table": "taxi_2013",
-            "base_url": "http://www.example.com",
+            "url": "http://www.example.com",
             "files": [
                 {
                     "file_size": 5653,

diff --git a/tests/test_datalogistik.py b/tests/test_datalogistik.py
@@ -299,7 +299,10 @@ def test_compress(comp_string):
 # Integration-style tests
 def test_main(capsys):
     # This should be in the cache already, so no conversion needed
-    exact_dataset = dataset.Dataset(name="fanniemae_sample", format="csv", delim="|")
+    # import pdb; pdb.set_trace()
+    exact_dataset = dataset.Dataset(
+        name="fanniemae_sample", format="csv", delim="|", compression="gzip"
+    )
 
     with pytest.raises(SystemExit) as e:
         datalogistik.main(exact_dataset)
@@ -310,6 +313,11 @@ def test_main(capsys):
     assert captured["name"] == "fanniemae_sample"
     assert captured["format"] == "csv"
     assert isinstance(captured["tables"], dict)
+    # this is the path from the fixtures; if this doesn't match, we've actually converted and not just found the extant one
+    assert (
+        captured["tables"]["fanniemae_sample"]["path"]
+        == "tests/fixtures/test_cache/fanniemae_sample/a77e575/fanniemae_sample.csv.gz"
+    )
 
 
 @pytest.mark.skipif(sys.platform == "win32", reason="windows errors on the cleanup")