# -*- coding: utf-8 -*-
import sys

import requests
import pandas as pd
from dateutil import parser

DOCUMENTATION_MESSAGE = """
usage: get_gbif_stats.py <filter_type> <instance_name>

filter_type:    how to filter datasets. Set this parameter to
                "country", "publisher" or "host".

instance_name:  if filter_type is "country", instance_name should be
                the publishing country given as an ISO 3166-1 alpha-2
                (2-letter) country code. If filter_type is "publisher"
                or "host", instance_name should be the publishing or
                hosting organization's UUID key.
"""


def fetch_dataset_datapage(key, offset, limit):
    """ Fetch one page of download statistics for a dataset from GBIF """
    params = {'limit': limit, 'offset': offset}
    r = requests.get('https://api.gbif.org/v1/occurrence/download/dataset/' + key, params=params)
    r.raise_for_status()  # fail fast on HTTP errors
    return r.json()['results']
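
# Illustrative shape of one result item, inferred from the fields used
# below (values are made up):
#   {'numberRecords': 1234,
#    'download': {'status': 'SUCCEEDED',
#                 'modified': '2019-01-01T00:00:00.000+0000'}}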


def fetch_dataset_data(key):
    """ Fetch and merge all pages of download statistics for one dataset """
    offset = 0
    limit = 100
    download_stats = []
    more_results_to_find = True
    while more_results_to_find:
        page = fetch_dataset_datapage(key, offset, limit)
        download_stats += page
        offset += limit  # advance by the page size rather than a hard-coded 100
        if len(page) == 0:
            more_results_to_find = False
        # progress goes to stderr so it does not pollute the csv on stdout
        print('{0}: {1}'.format(key, offset), file=sys.stderr)
    return download_stats


def get_downloaded_records(key):
    """ Fetch and aggregate all download statistics for one dataset """
    raw_data = fetch_dataset_data(key)
    records = [x['numberRecords'] for x in raw_data]
    mod_dates = [parser.parse(x['download']['modified']) for x in raw_data]
    status_list = [x['download']['status'] for x in raw_data]
    data = pd.DataFrame({'numberRecords': records, 'mod_dates': mod_dates, 'status': status_list})
    if len(data) == 0:
        return None  # no download events recorded for this dataset
    data['year'] = data['mod_dates'].apply(lambda x: x.year)
    return data
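
# The frame returned above has one row per download event; columns are
# numberRecords, mod_dates, status and year (values below are made up):
#   numberRecords  mod_dates                  status     year
#   1234           2019-01-01 00:00:00+00:00  SUCCEEDED  2019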


def fetch_datasets(filter_type=None, instance_name=None):
    """ Fetch all sampling event datasets for a country, publisher or host """
    more_results_to_find = True
    offset = 0
    limit = 20
    all_datasets = []
    while more_results_to_find:
        params = {'offset': offset, 'limit': limit, 'type': 'sampling_event'}
        if filter_type == 'country':
            params['publishingCountry'] = instance_name
        elif filter_type == 'publisher':
            params['publishingOrg'] = instance_name
        elif filter_type == 'host':
            params['hostingOrg'] = instance_name
        r = requests.get('https://api.gbif.org/v1/dataset/search', params=params)
        datasets = r.json()['results']
        all_datasets += datasets
        offset += limit
        if len(datasets) == 0:  # keep paging until an empty page comes back
            more_results_to_find = False
    return all_datasets
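
# Each dataset in the search results carries (among other fields) the
# 'key' and 'publishingOrganizationTitle' consumed below.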


def write_stats(datasets):
    """ Fetch dataset stats and print a csv report to stdout """
    IGNORE_PUBLISHERS = []
    all_data = []
    for ds in datasets:
        publisher = ds['publishingOrganizationTitle']
        if publisher in IGNORE_PUBLISHERS:
            continue
        try:
            download_data = get_downloaded_records(ds['key'])
            if download_data is None:
                print('no downloads for {0}'.format(ds['key']), file=sys.stderr)
            else:
                download_data['dataset_key'] = ds['key']
                all_data.append(download_data)
        except Exception as e:
            print('Problem with dataset {0}: {1}'.format(ds['key'], e), file=sys.stderr)
    if not all_data:
        print('no download statistics found', file=sys.stderr)
        return
    all_df = pd.concat(all_data)
    succeeded = all_df[all_df['status'].isin(['SUCCEEDED', 'FILE_ERASED'])]
    # named aggregation; the dict-of-dicts form used previously was removed in pandas 1.0
    result = succeeded.groupby(['year', 'dataset_key'])['numberRecords'].agg(
        recordsDownloaded='sum', downloadEvents='count')
    print(result.to_csv())
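
# The report has one row per (year, dataset_key) pair; its header line is
# year,dataset_key,recordsDownloaded,downloadEvents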


def main():
    if len(sys.argv) != 3:
        print(DOCUMENTATION_MESSAGE)
        sys.exit(1)
    filter_type = sys.argv[1]
    if filter_type not in ['country', 'publisher', 'host']:
        print('filter_type should be "country", "publisher" or "host"')
        sys.exit(1)
    instance_name = sys.argv[2]
    datasets = fetch_datasets(filter_type=filter_type, instance_name=instance_name)
    write_stats(datasets)


if __name__ == '__main__':
    main()