Refactor all downloads and add URL testing / checksum (facebookresearch#2100)

* refactor tasks and add URL testing

* black

* add google drive URL testing

* change in testing google URLs

* finalize SHA and fix nlvr

* black

* remove duplicate import

* add deleted files back

* Implements DownloadableFile and changes all tasks to use an object of the class

* removes task decanlp from testing URL

* black

* update coco_caption SHA

* move test_urls to nightly and change logic

* black

* catch specific errors for lint

* [long]
jp848 authored Nov 11, 2019
1 parent d15e3a5 commit ea366da
Showing 85 changed files with 1,428 additions and 615 deletions.
80 changes: 80 additions & 0 deletions parlai/core/build_data.py
@@ -24,6 +24,86 @@
from multiprocessing import Pool


class DownloadableFile:
    """
    A class used to abstract any file that has to be downloaded online.

    Any task that needs to download a file should define a list named
    RESOURCES whose elements are objects of this class.

    This class provides the following functionality:
    - Download a file from a URL / Google Drive
    - Untar the file if it is compressed
    - Verify the SHA256 checksum of the downloaded file
    - Send a HEAD request to validate the URL or Google Drive link

    An object of this class is created with:
    - url <string>: URL or Google Drive id to download from
    - file_name <string>: name under which the downloaded file is saved
    - hashcode <string>: SHA256 hashcode of the downloaded file
    - zipped <boolean>: False if the file is not compressed
    - from_google <boolean>: True if the file is hosted on Google Drive
    """

    def __init__(self, url, file_name, hashcode, zipped=True, from_google=False):
        self.url = url
        self.file_name = file_name
        self.hashcode = hashcode
        self.zipped = zipped
        self.from_google = from_google

    def checksum(self, dpath):
        """
        Verify the SHA256 checksum of the downloaded file.

        :param dpath: path to the folder containing the downloaded file.
        """
        sha256_hash = hashlib.sha256()
        with open(os.path.join(dpath, self.file_name), "rb") as f:
            for byte_block in iter(lambda: f.read(65536), b""):
                sha256_hash.update(byte_block)
            if sha256_hash.hexdigest() != self.hashcode:
                # remove_dir(dpath)
                raise AssertionError(
                    f"[ Checksum for {self.file_name} from \n{self.url}\n"
                    "does not match the expected checksum. Please try again. ]"
                )
            else:
                print("[ Checksum Successful ]")

    def download_file(self, dpath):
        if self.from_google:
            download_from_google_drive(self.url, os.path.join(dpath, self.file_name))
        else:
            download(self.url, dpath, self.file_name)

        self.checksum(dpath)

        if self.zipped:
            untar(dpath, self.file_name)

    def check_header(self):
        """
        Perform a HEAD request to check that the URL / Google Drive ID is live.
        """
        session = requests.Session()
        if self.from_google:
            URL = 'https://docs.google.com/uc?export=download'
            response = session.head(URL, params={'id': self.url}, stream=True)
        else:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
            }
            response = session.head(self.url, allow_redirects=True, headers=headers)
        status = response.status_code
        session.close()

        assert status == 200


def built(path, version_string=None):
"""
Check if '.built' flag has been set for that task.
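Every RESOURCES entry in the task files below supplies a SHA256 hashcode that checksum() compares against. As a side note for contributors adding a new resource, the expected digest can be computed locally on the downloaded file before filling in the DownloadableFile entry. The sketch below is illustrative only and not part of this commit; the file path is a placeholder.

import hashlib


def sha256_of(path, chunk_size=65536):
    # Compute the SHA256 hex digest of a local file, reading in chunks
    # so large archives do not need to fit in memory.
    sha256_hash = hashlib.sha256()
    with open(path, 'rb') as f:
        for byte_block in iter(lambda: f.read(chunk_size), b''):
            sha256_hash.update(byte_block)
    return sha256_hash.hexdigest()


if __name__ == '__main__':
    # Paste the printed digest into the corresponding DownloadableFile entry.
    print(sha256_of('aqua.zip'))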
19 changes: 10 additions & 9 deletions parlai/tasks/aqua/build.py
@@ -5,12 +5,18 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from parlai.core.build_data import DownloadableFile
import parlai.core.build_data as build_data
import os
import shutil


AQUA_BASE_URL = 'https://github.com/deepmind/AQuA/archive/master.zip'
RESOURCES = [
    DownloadableFile(
        'https://github.com/deepmind/AQuA/archive/master.zip',
        'aqua.zip',
        '08ea725477f6a8577a7cc1a2ae08c7a56917aa3ec45193f173b298b6b526c603',
    )
]


def build(opt):
@@ -25,13 +31,8 @@ def build(opt):
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # download the data.
        fname = 'aqua.zip'

        build_data.download(AQUA_BASE_URL, dpath, fname)

        # uncompress it
        build_data.untar(dpath, fname)
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        base_path = os.path.join(dpath, 'AQuA-master')
        new_path = os.path.join(dpath, 'AQuA')
15 changes: 11 additions & 4 deletions parlai/tasks/babi/build.py
@@ -6,9 +6,18 @@
#
# Download and build the data if it does not exist.

from parlai.core.build_data import DownloadableFile
import parlai.core.build_data as build_data
import os

RESOURCES = [
    DownloadableFile(
        'http://parl.ai/downloads/babi/babi.tar.gz',
        'babi.tar.gz',
        'f7f0bee187efca0d81c3daac1b162cda4eb7f9505dee5ad6846eabbed3dbf92e',
    )
]


def build(opt):
    dpath = os.path.join(opt['datapath'], 'bAbI')
@@ -22,10 +31,8 @@ def build(opt):
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'babi.tar.gz'
        url = 'http://parl.ai/downloads/babi/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
15 changes: 11 additions & 4 deletions parlai/tasks/booktest/build.py
@@ -5,9 +5,18 @@
# LICENSE file in the root directory of this source tree.
# Download and build the data if it does not exist.

from parlai.core.build_data import DownloadableFile
import parlai.core.build_data as build_data
import os

RESOURCES = [
    DownloadableFile(
        'http://parl.ai/downloads/booktest/booktest.tar.bz2',
        'booktest.tar.bz2',
        '4079481d19c7681e3256c06ffd2781a230aca4a8d9390f3a5932c33e4b857c9d',
    )
]


def build(opt):
    dpath = os.path.join(opt['datapath'], 'BookTest')
@@ -21,10 +30,8 @@ def build(opt):
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'booktest.tar.bz2'
        url = 'http://parl.ai/downloads/booktest/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
15 changes: 11 additions & 4 deletions parlai/tasks/cbt/build.py
@@ -5,9 +5,18 @@
# LICENSE file in the root directory of this source tree.
# Download and build the data if it does not exist.

from parlai.core.build_data import DownloadableFile
import parlai.core.build_data as build_data
import os

RESOURCES = [
    DownloadableFile(
        'http://parl.ai/downloads/cbt/cbt.tar.gz',
        'cbt.tar.gz',
        '932df0cadc1337b2a12b4c696b1041c1d1c6d4b6bd319874c6288f02e4a61e92',
    )
]


def build(opt):
    dpath = os.path.join(opt['datapath'], 'CBT')
@@ -21,10 +30,8 @@ def build(opt):
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'cbt.tar.gz'
        url = 'http://parl.ai/downloads/cbt/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
15 changes: 12 additions & 3 deletions parlai/tasks/ccpe/build.py
@@ -3,9 +3,19 @@
# LICENSE file in the root directory of this source tree.
# Download and build the data if it does not exist.

from parlai.core.build_data import DownloadableFile
import parlai.core.build_data as build_data
import os

RESOURCES = [
    DownloadableFile(
        'https://storage.googleapis.com/dialog-data-corpus/CCPE-M-2019/data.json',
        'ccpe.json',
        '4ff051ea7ea60cf0f480c911c7e2cfed56434e2e2c9ea8965ac5e26365773f0a',
        zipped=False,
    )
]


def build(opt):
    dpath = os.path.join(opt['datapath'], 'CCPE')
@@ -19,9 +29,8 @@ def build(opt):
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'ccpe.json'
        url = "https://storage.googleapis.com/dialog-data-corpus/CCPE-M-2019/data.json"
        build_data.download(url, dpath, fname)
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
17 changes: 12 additions & 5 deletions parlai/tasks/clevr/build.py
@@ -5,10 +5,20 @@
# LICENSE file in the root directory of this source tree.
# Download and build the data if it does not exist.

from parlai.core.build_data import DownloadableFile
import parlai.core.build_data as build_data
import os


RESOURCES = [
    DownloadableFile(
        'https://dl.fbaipublicfiles.com/clevr/CLEVR_v1.0.zip',
        'CLEVR_v1.0.zip',
        '5cd61cf1096ed20944df93c9adb31e74d189b8459a94f54ba00090e5c59936d1',
    )
]


def build(opt):
    dpath = os.path.join(opt['datapath'], 'CLEVR')
    version = 'v1.0'
@@ -21,11 +31,8 @@ def build(opt):
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'CLEVR_v1.0.zip'
        url = 'https://dl.fbaipublicfiles.com/clevr/'

        build_data.download(url + fname, dpath, fname)
        build_data.untar(dpath, fname)
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
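The commit also adds a nightly URL test (per the "move test_urls to nightly" step above) that is not shown in this excerpt. A minimal sketch of how such a check could exercise check_header() over each task's RESOURCES list follows; the module list and test names here are assumptions for illustration, not the actual ParlAI test code.

import importlib
import unittest


# Hypothetical subset of task build modules to probe; the real nightly
# test may discover tasks differently.
TASK_BUILD_MODULES = [
    'parlai.tasks.aqua.build',
    'parlai.tasks.babi.build',
    'parlai.tasks.cbt.build',
]


class TestResourceUrls(unittest.TestCase):
    def test_urls_are_live(self):
        for module_name in TASK_BUILD_MODULES:
            module = importlib.import_module(module_name)
            for resource in getattr(module, 'RESOURCES', []):
                # check_header() sends a HEAD request and asserts a 200 response.
                resource.check_header()


if __name__ == '__main__':
    unittest.main()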