Refactor all downloads and add URL testing / checksum (facebookresearch#2100)

* refactor tasks and add URL testing

* black

* add google drive URL testing

* change in testing google URLs

* finalize SHA and fix nlvr

* black

* remove duplicate import

* add deleted files back

* Implements DownloadableFile and changes all tasks to use an object of the class

* removes task decanlp from testing URL

* black

* update coco_caption SHA

* move test_urls to nightly and change logic

* black

* catch specific errors for lint

* [long]
jp848 authored Nov 11, 2019
1 parent d15e3a5 commit ea366da
Showing 85 changed files with 1,428 additions and 615 deletions.
80 changes: 80 additions & 0 deletions parlai/core/build_data.py
@@ -24,6 +24,86 @@
from multiprocessing import Pool


class DownloadableFile:
    """
    A class used to abstract any file that has to be downloaded online.

    Any task that needs to download a file should define a list named
    RESOURCES whose elements are objects of this class.

    This class provides the following functionality:
    - Download a file from a URL / Google Drive
    - Untar the file if it is compressed
    - Verify the SHA256 checksum of the downloaded file
    - Send a HEAD request to validate the URL or Google Drive link

    An object of this class is created with:
    - url <string>: URL or Google Drive id to download from
    - file_name <string>: name under which the downloaded file is saved
    - hashcode <string>: SHA256 hashcode of the downloaded file
    - zipped <boolean>: False if the file is not compressed
    - from_google <boolean>: True if the file is hosted on Google Drive
    """

    def __init__(self, url, file_name, hashcode, zipped=True, from_google=False):
        self.url = url
        self.file_name = file_name
        self.hashcode = hashcode
        self.zipped = zipped
        self.from_google = from_google

    def checksum(self, dpath):
        """
        Verify the SHA256 checksum of the downloaded file.

        :param dpath: path to the folder containing the downloaded file.
        """
        sha256_hash = hashlib.sha256()
        with open(os.path.join(dpath, self.file_name), "rb") as f:
            for byte_block in iter(lambda: f.read(65536), b""):
                sha256_hash.update(byte_block)
            if sha256_hash.hexdigest() != self.hashcode:
                # remove_dir(dpath)
                raise AssertionError(
                    f"[ Checksum for {self.file_name} from \n{self.url}\n"
                    "does not match the expected checksum. Please try again. ]"
                )
            else:
                print("[ Checksum Successful ]")

    def download_file(self, dpath):
        if self.from_google:
            download_from_google_drive(self.url, os.path.join(dpath, self.file_name))
        else:
            download(self.url, dpath, self.file_name)

        self.checksum(dpath)

        if self.zipped:
            untar(dpath, self.file_name)

    def check_header(self):
        """
        Perform a HEAD request to check that the URL / Google Drive ID is live.
        """
        session = requests.Session()
        if self.from_google:
            URL = 'https://docs.google.com/uc?export=download'
            response = session.head(URL, params={'id': self.url}, stream=True)
        else:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
            }
            response = session.head(self.url, allow_redirects=True, headers=headers)
        status = response.status_code
        session.close()

        assert status == 200


def built(path, version_string=None):
"""
Check if '.built' flag has been set for that task.
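Every RESOURCES entry in the task files below supplies a SHA256 hashcode that checksum() compares against. As a side note for contributors adding a new resource, the expected digest can be computed locally on the downloaded file before filling in the DownloadableFile entry. The sketch below is illustrative only and not part of this commit; the file path is a placeholder.

import hashlib


def sha256_of(path, chunk_size=65536):
    # Compute the SHA256 hex digest of a local file, reading in chunks
    # so large archives do not need to fit in memory.
    sha256_hash = hashlib.sha256()
    with open(path, 'rb') as f:
        for byte_block in iter(lambda: f.read(chunk_size), b''):
            sha256_hash.update(byte_block)
    return sha256_hash.hexdigest()


if __name__ == '__main__':
    # Paste the printed digest into the corresponding DownloadableFile entry.
    print(sha256_of('aqua.zip'))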
19 changes: 10 additions & 9 deletions parlai/tasks/aqua/build.py
@@ -5,12 +5,18 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from parlai.core.build_data import DownloadableFile
import parlai.core.build_data as build_data
import os
import shutil


AQUA_BASE_URL = 'https://github.com/deepmind/AQuA/archive/master.zip'
RESOURCES = [
    DownloadableFile(
        'https://github.com/deepmind/AQuA/archive/master.zip',
        'aqua.zip',
        '08ea725477f6a8577a7cc1a2ae08c7a56917aa3ec45193f173b298b6b526c603',
    )
]


def build(opt):
@@ -25,13 +31,8 @@ def build(opt):
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # download the data.
        fname = 'aqua.zip'

        build_data.download(AQUA_BASE_URL, dpath, fname)

        # uncompress it
        build_data.untar(dpath, fname)
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        base_path = os.path.join(dpath, 'AQuA-master')
        new_path = os.path.join(dpath, 'AQuA')
15 changes: 11 additions & 4 deletions parlai/tasks/babi/build.py
@@ -6,9 +6,18 @@
#
# Download and build the data if it does not exist.

from parlai.core.build_data import DownloadableFile
import parlai.core.build_data as build_data
import os

RESOURCES = [
    DownloadableFile(
        'http://parl.ai/downloads/babi/babi.tar.gz',
        'babi.tar.gz',
        'f7f0bee187efca0d81c3daac1b162cda4eb7f9505dee5ad6846eabbed3dbf92e',
    )
]


def build(opt):
    dpath = os.path.join(opt['datapath'], 'bAbI')
@@ -22,10 +31,8 @@ def build(opt):
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'babi.tar.gz'
        url = 'http://parl.ai/downloads/babi/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
15 changes: 11 additions & 4 deletions parlai/tasks/booktest/build.py
@@ -5,9 +5,18 @@
# LICENSE file in the root directory of this source tree.
# Download and build the data if it does not exist.

from parlai.core.build_data import DownloadableFile
import parlai.core.build_data as build_data
import os

RESOURCES = [
    DownloadableFile(
        'http://parl.ai/downloads/booktest/booktest.tar.bz2',
        'booktest.tar.bz2',
        '4079481d19c7681e3256c06ffd2781a230aca4a8d9390f3a5932c33e4b857c9d',
    )
]


def build(opt):
    dpath = os.path.join(opt['datapath'], 'BookTest')
@@ -21,10 +30,8 @@ def build(opt):
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'booktest.tar.bz2'
        url = 'http://parl.ai/downloads/booktest/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
15 changes: 11 additions & 4 deletions parlai/tasks/cbt/build.py
@@ -5,9 +5,18 @@
# LICENSE file in the root directory of this source tree.
# Download and build the data if it does not exist.

from parlai.core.build_data import DownloadableFile
import parlai.core.build_data as build_data
import os

RESOURCES = [
    DownloadableFile(
        'http://parl.ai/downloads/cbt/cbt.tar.gz',
        'cbt.tar.gz',
        '932df0cadc1337b2a12b4c696b1041c1d1c6d4b6bd319874c6288f02e4a61e92',
    )
]


def build(opt):
    dpath = os.path.join(opt['datapath'], 'CBT')
@@ -21,10 +30,8 @@ def build(opt):
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'cbt.tar.gz'
        url = 'http://parl.ai/downloads/cbt/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
15 changes: 12 additions & 3 deletions parlai/tasks/ccpe/build.py
@@ -3,9 +3,19 @@
# LICENSE file in the root directory of this source tree.
# Download and build the data if it does not exist.

from parlai.core.build_data import DownloadableFile
import parlai.core.build_data as build_data
import os

RESOURCES = [
    DownloadableFile(
        'https://storage.googleapis.com/dialog-data-corpus/CCPE-M-2019/data.json',
        'ccpe.json',
        '4ff051ea7ea60cf0f480c911c7e2cfed56434e2e2c9ea8965ac5e26365773f0a',
        zipped=False,
    )
]


def build(opt):
    dpath = os.path.join(opt['datapath'], 'CCPE')
@@ -19,9 +29,8 @@ def build(opt):
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'ccpe.json'
        url = "https://storage.googleapis.com/dialog-data-corpus/CCPE-M-2019/data.json"
        build_data.download(url, dpath, fname)
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
17 changes: 12 additions & 5 deletions parlai/tasks/clevr/build.py
@@ -5,10 +5,20 @@
# LICENSE file in the root directory of this source tree.
# Download and build the data if it does not exist.

from parlai.core.build_data import DownloadableFile
import parlai.core.build_data as build_data
import os


RESOURCES = [
    DownloadableFile(
        'https://dl.fbaipublicfiles.com/clevr/CLEVR_v1.0.zip',
        'CLEVR_v1.0.zip',
        '5cd61cf1096ed20944df93c9adb31e74d189b8459a94f54ba00090e5c59936d1',
    )
]


def build(opt):
    dpath = os.path.join(opt['datapath'], 'CLEVR')
    version = 'v1.0'
@@ -21,11 +31,8 @@ def build(opt):
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'CLEVR_v1.0.zip'
        url = 'https://dl.fbaipublicfiles.com/clevr/'

        build_data.download(url + fname, dpath, fname)
        build_data.untar(dpath, fname)
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
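The commit also adds a nightly URL test (per the "move test_urls to nightly" step above) that is not shown in this excerpt. A minimal sketch of how such a check could exercise check_header() over each task's RESOURCES list follows; the module list and test names here are assumptions for illustration, not the actual ParlAI test code.

import importlib
import unittest


# Hypothetical subset of task build modules to probe; the real nightly
# test may discover tasks differently.
TASK_BUILD_MODULES = [
    'parlai.tasks.aqua.build',
    'parlai.tasks.babi.build',
    'parlai.tasks.cbt.build',
]


class TestResourceUrls(unittest.TestCase):
    def test_urls_are_live(self):
        for module_name in TASK_BUILD_MODULES:
            module = importlib.import_module(module_name)
            for resource in getattr(module, 'RESOURCES', []):
                # check_header() sends a HEAD request and asserts a 200 response.
                resource.check_header()


if __name__ == '__main__':
    unittest.main()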