Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add retries to download functions #59

Merged
merged 3 commits into from
Mar 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 31 additions & 13 deletions iatikit/utils/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,24 +8,31 @@
import zipfile

import requests
from requests.adapters import HTTPAdapter
from urllib3 import Retry

from ..standard.codelist import CodelistSet
from .config import CONFIG
from . import helpers


# Transport adapter shared by every requests.Session created in this module.
# It is built with urllib3's Retry(total=3); a session only routes a request
# through it when the URL matches a prefix the adapter was mounted on.
http_adapter = HTTPAdapter(max_retries=Retry(total=3))


def data():
session = requests.Session()
session.mount('https://', http_adapter)
path = CONFIG['paths']['registry']
# downloads from https://iati-data-dump.codeforiati.org
download_url = 'https://iati-data-dump.codeforiati.org/download'
response = requests.get(download_url)
response = session.get(download_url)
data_url = response.text.strip()
shutil.rmtree(path, ignore_errors=True)
makedirs(path)
zip_filepath = join(path, 'iati_dump.zip')

logging.getLogger(__name__).info('Downloading all IATI registry data...')
response = requests.get(data_url, stream=True)
response = session.get(data_url, stream=True)
with open(zip_filepath, 'wb') as handler:
shutil.copyfileobj(response.raw, handler)
logging.getLogger(__name__).info('Unzipping data...')
Expand All @@ -39,6 +46,8 @@ def data():


def metadata():
session = requests.Session()
session.mount('https://', http_adapter)
logging.getLogger(__name__).info(
'Downloading metadata from the IATI registry...')
path = join(CONFIG['paths']['registry'], 'metadata')
Expand All @@ -51,7 +60,7 @@ def metadata():
'?id={org_slug}'
start = 0
while True:
j = requests.get(url_tmpl.format(start=start)).json()
j = session.get(url_tmpl.format(start=start)).json()
if len(j['result']['results']) == 0:
break
for res in j['result']['results']:
Expand All @@ -60,7 +69,7 @@ def metadata():
continue
org_name = org['name']
if not exists(join(path, org_name + '.json')):
j = requests.get(org_url_tmpl.format(org_slug=org_name)).json()
j = session.get(org_url_tmpl.format(org_slug=org_name)).json()
with open(join(path, org_name + '.json'), 'w') as f:
json.dump(j['result'], f)
dataset_name = res['name']
Expand Down Expand Up @@ -94,6 +103,9 @@ def metadata():
def _get_codelist_mappings(versions):
all_codelists = CodelistSet()

session = requests.Session()
session.mount('https://', http_adapter)

path = join(CONFIG['paths']['standard'], 'codelist_mappings')
shutil.rmtree(path, ignore_errors=True)
makedirs(path)
Expand All @@ -110,7 +122,7 @@ def _get_codelist_mappings(versions):
makedirs(mapping_path)

mapping_url = tmpl.format(version=version_path)
mappings = requests.get(mapping_url).json()
mappings = session.get(mapping_url).json()

activity_mappings = [
x for x in mappings
Expand All @@ -129,40 +141,44 @@ def _get_codelist_mappings(versions):

def codelists():
def get_list_of_codelists(version):
    """Return the codelist listing published for one IATI Standard version.

    The source depends on which group ``version`` belongs to:

    * in ``_VERY_OLD_IATI_VERSIONS``: a CSV document is fetched and the
      ``name`` column of every row is collected;
    * in ``_OLD_IATI_VERSIONS``: a JSON document is fetched and the ``name``
      of every entry under its ``codelist`` key is collected;
    * otherwise: the JSON body fetched from ``_NEW_CODELISTS_TMPL`` (with
      the dots stripped from ``version``) is returned as-is.

    Each resource is requested exactly once, through the session that has
    the retrying adapter mounted.
    """
    session = requests.Session()
    # NOTE(review): the adapter is only mounted for https://; confirm the
    # codelist URL constants all use that scheme, otherwise retries will not
    # apply to them.
    session.mount('https://', http_adapter)
    if version in _VERY_OLD_IATI_VERSIONS:
        request = session.get(_VERY_OLD_CODELISTS_URL)
        # iter_lines() yields bytes, so each line is decoded before it is
        # handed to the CSV reader.
        list_of_codelists = [x['name'] for x in csv.DictReader(
            [x.decode() for x in request.iter_lines()])]
    elif version in _OLD_IATI_VERSIONS:
        j = session.get(_OLD_CODELISTS_URL).json()
        list_of_codelists = [x['name'] for x in j['codelist']]
    else:
        codelists_url = _NEW_CODELISTS_TMPL.format(
            version=version.replace('.', ''))
        list_of_codelists = session.get(codelists_url).json()
    return list_of_codelists

def get_codelist(codelist_name, version):
    """Fetch a single codelist as published for one IATI Standard version.

    For versions in ``_VERY_OLD_IATI_VERSIONS`` or ``_OLD_IATI_VERSIONS`` the
    codelist is downloaded as CSV and returned as ``{'data': rows}``, where
    ``rows`` is the list of row dicts produced by ``csv.DictReader``. For any
    other version the JSON body fetched from ``_NEW_CODELIST_TMPL`` is
    returned unchanged.

    Each URL is requested exactly once, through the session that has the
    retrying adapter mounted.
    """
    session = requests.Session()
    # NOTE(review): the adapter is only mounted for https://; confirm the
    # codelist URL templates all use that scheme, otherwise retries will not
    # apply to them.
    session.mount('https://', http_adapter)

    if version in _VERY_OLD_IATI_VERSIONS:
        codelist_url = _VERY_OLD_CODELIST_TMPL.format(
            codelist_name=codelist_name)
    elif version in _OLD_IATI_VERSIONS:
        codelist_url = _OLD_CODELIST_TMPL.format(
            codelist_name=codelist_name)
    else:
        codelist_url = _NEW_CODELIST_TMPL.format(
            codelist_name=codelist_name,
            version=version.replace('.', ''))
        return session.get(codelist_url).json()

    # Both legacy groups are parsed the same way: decode the byte lines from
    # iter_lines() and read them as CSV.
    request = session.get(codelist_url)
    codes = list(csv.DictReader(
        [x.decode() for x in request.iter_lines()]))
    return {'data': codes}

path = join(CONFIG['paths']['standard'], 'codelists')
Expand Down Expand Up @@ -219,14 +235,16 @@ def get_codelist(codelist_name, version):


def schemas():
session = requests.Session()
session.mount('https://', http_adapter)
path = join(CONFIG['paths']['standard'], 'schemas')
shutil.rmtree(path, ignore_errors=True)
makedirs(path)

versions_url = 'https://iatistandard.org/reference_downloads/' + \
'201/codelists/downloads/clv2/json/en/' + \
'Version.json'
versions = [d['code'] for d in requests.get(versions_url).json()['data']]
versions = [d['code'] for d in session.get(versions_url).json()['data']]
versions.reverse()

logging.getLogger(__name__).info('Downloading IATI Standard schemas...')
Expand All @@ -239,7 +257,7 @@ def schemas():
makedirs(join(path, version_path))
for filename in filenames:
url = tmpl.format(version=version, filename=filename)
request = requests.get(url)
request = session.get(url)
filepath = join(path, version_path, filename)
with open(filepath, 'wb') as handler:
handler.write(request.content)
Expand Down
12 changes: 10 additions & 2 deletions iatikit/utils/helpers.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,18 @@
import requests
from requests.adapters import HTTPAdapter
from urllib3 import Retry

# Transport adapter used by the session created in this module. It is built
# with urllib3's Retry(total=3); a session only routes a request through it
# when the URL matches a prefix the adapter was mounted on.
http_adapter = HTTPAdapter(max_retries=Retry(total=3))


def get_iati_versions():
    """Return the IATI Standard version codes, reversed from the served order.

    Fetches the ``Version`` codelist JSON and collects the ``code`` field of
    every entry under its ``data`` key, then reverses the list in place.

    The codelist URL uses plain ``http://``, so the retrying adapter is
    mounted for that scheme as well as ``https://``. Mounted on ``https://``
    alone, it would never be selected for this request and no retries would
    happen.

    Returns:
        list: the ``code`` values in reverse of the order they appear in the
        response.
    """
    versions_url = 'http://reference.iatistandard.org/201/codelists/' + \
        'downloads/clv2/json/en/Version.json'
    session = requests.Session()
    for scheme in ('http://', 'https://'):
        session.mount(scheme, http_adapter)
    # Single request, always through the retrying session.
    payload = session.get(versions_url).json()
    versions = [d['code'] for d in payload['data']]
    versions.reverse()
    return versions
25 changes: 16 additions & 9 deletions tests/test_download_codelists.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import shutil
import tempfile
from unittest import TestCase

from mock import patch

from iatikit.utils import download
Expand All @@ -17,8 +16,10 @@ def setUp(self):
config_dict = {'paths': {'standard': self.standard_path}}
CONFIG.read_dict(config_dict)

@patch('requests.get', CodelistMockRequest)
def test_download_codelists(self):
@patch('requests.Session')
def test_download_codelists(self, mock_session):
mock_session.return_value.get.side_effect = CodelistMockRequest

download.codelists()

codelists_expected = {
Expand All @@ -34,8 +35,10 @@ def test_download_codelists(self):
codelists = json.load(handler)
assert codelists == codelists_expected

@patch('requests.get', CodelistMockRequest)
def test_download_codelist_from_until(self):
@patch('requests.Session')
def test_download_codelist_from_until(self, mock_session):
mock_session.return_value.get.side_effect = CodelistMockRequest

download.codelists()

path = join(self.standard_path, 'codelists', 'ActivityStatus.json')
Expand All @@ -47,8 +50,10 @@ def test_download_codelist_from_until(self):
assert vocabs['data']['1']['from'] == '1.01'
assert vocabs['data']['1']['until'] == '2.01'

@patch('requests.get', CodelistMockRequest)
def test_download_codelist_items(self):
@patch('requests.Session')
def test_download_codelist_items(self, mock_session):
mock_session.return_value.get.side_effect = CodelistMockRequest

download.codelists()

path = join(self.standard_path, 'codelists', 'Sector.json')
Expand All @@ -58,8 +63,10 @@ def test_download_codelist_items(self):
sector_name = 'Media and free flow of information'
assert vocabs['data']['15153']['name'] == sector_name

@patch('requests.get', CodelistMockRequest)
def test_download_codelist_mappings(self):
@patch('requests.Session')
def test_download_codelist_mappings(self, mock_session):
mock_session.return_value.get.side_effect = CodelistMockRequest

download.codelists()

path = join(self.standard_path, 'codelist_mappings')
Expand Down
6 changes: 4 additions & 2 deletions tests/test_download_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,10 @@ def setUp(self):
config_dict = {'paths': {'standard': self.standard_path}}
CONFIG.read_dict(config_dict)

@patch('requests.get', MockRequest)
def test_download_schemas(self):
@patch('requests.Session')
def test_download_schemas(self, mock_session):
mock_session.return_value.get.side_effect = MockRequest

download.schemas()

filenames = [
Expand Down