# Copyright 2022, Loic Leray
# Please acknowledge Loic Leray for making this data available in your
# research.
# ---
# See https://github.com/loicleray/OIE_WAHIS.ReportRetriever for documentation
# and explanations.
import argparse
import json
import os
import pprint
import time
from datetime import date, timedelta

import pandas as pd
import requests
from tqdm import tqdm


def get_filter_options():
    '''Returns a dictionary with the options and acceptable values to filter
    WAHIS diseases.'''
    # WAHIS filter names taken from the filter columns
    # on https://wahis.woah.org/#/events
wahis_filter_names = ["country", "region", "epiEventId", "diseases", "diseaseType",
"reason", "eventDate", "eventStatus", "reportHistoryType", "reportDate"]
report_filter_options = {}
try:
for item in wahis_filter_names:
url = f"https://wahis.woah.org/pi/reports/filters?columnName={item}"
            # If sending blank "payload" and "headers" stops working, you may
            # need to copy the full request from your browser; cURL or Postman
            # can generate this code automatically.
payload = {}
headers = {}
response = requests.request("GET", url, headers=headers, data=payload)
report_filter_options[item] = response.json()['dropDownValue']
    except Exception as error:
        print(f"Something went wrong when trying to access filter lists: {error}")
return report_filter_options
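
# Illustrative usage sketch (the exact values inside each dropdown list depend
# on the live WAHIS API response):
#   options = get_filter_options()
#   print(options["diseases"][:5])  # peek at the first few disease values
#   print(options["region"])        # values accepted by the 'region' filter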
def save_filter_options(save_path):
    '''Save the contents of get_filter_options() as a JSON file in a
    directory of your choosing.'''
file_name = "WAHIS_filter_options"
full_path = os.path.join(save_path, file_name+".json")
filter_options = get_filter_options()
print("Creating file with filter options for you to check...")
    with open(full_path, "w") as file:
        json.dump(filter_options, file)
    print(f"File saved as {file_name}.json in {save_path}.")
def get_report_list(country=[], region=[], disease=[], start_date="1901-01-01", end_date=str(date.today())):
    '''Returns a dictionary of reports matching the filters passed to the
    function. Be careful: the default is to return all results from
    1901-01-01 to the present date. It is your responsibility to limit
    results using the filters specified in the documentation (link at top
    of file).'''
    # setup
URL = "https://wahis.woah.org/pi/getReportList"
    # Empty headers work at the time of writing. If requests start failing,
    # copy the full set of browser headers (accept, content-type, token,
    # user-agent, etc.) from your browser's network tab and add them here.
    headers = {}
payload = json.dumps({
"pageNumber": 1,
"pageSize": 1000000000, # 1000000000 is the MAX allowable to be given in payload without "BAD REQUEST ERROR"
"searchText": "", # Not yet added to the possible arguments for get_report_list()
"sortColName": "", # Not yet added to the possible arguments for get_report_list()
# "ASC" = Ascending oldest to newest, "DESC" = Descending oldest to newest.
"sortColOrder": "DESC",
"reportFilters": {
"country": country, # See list wahis_filter_names['country'] for possible options/
"region": region, # See list wahis_filter_names['region'] for possible options/
"epiEventId": [],
"diseases": disease, # See list wahis_filter_names['disease'] for possible options/
"diseaseType": [], # Not yet added to the possible arguments for get_report_list()
"reason": [], # Not yet added to the possible arguments for get_report_list()
"eventDate": {}, # Not yet added to the possible arguments for get_report_list()
"eventStatus": [], # Not yet added to the possible arguments for get_report_list()
"reportHistoryType": [], # Not yet added to the possible arguments for get_report_list()
"reportDate": {
"startDate": start_date, # Must be in a string in YYYY-MM-DD format. Must Precede endDate
"endDate": end_date # Must be in a string in YYYY-MM-DD format. Must come after startDate
}
},
"languageChanged": False # Not yet added to the possible arguments for get_report_list()
})
response_report_list = requests.request("POST", URL, headers=headers, data=payload)
# JSON RESULTS from POST /pi/getReportList
return response_report_list.json()
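
# Illustrative usage sketch (disease names must match WAHIS exactly; check
# get_filter_options()['diseases'] first. The trailing space in 'Anthrax '
# mirrors the --disease help example below):
#   reports = get_report_list(country=["France"],
#                             disease=["Anthrax "],
#                             start_date="2021-01-01",
#                             end_date="2021-12-31")
#   print(len(reports["homePageDto"]))  # number of matching report summaries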
def get_report_contents(report_info_id):
'''Returns report data as dictionary for a given reportID.'''
url_report = f'https://wahis.woah.org/pi/getReport/{str(report_info_id)}'
    # Payload and headers don't appear to be needed in this case.
    # tk May need to add a solution if the code doesn't work on other systems.
    payload_report = {}
    headers_report = {}
    contents_single_report = None
    try:
        # sleep to reduce server load
        time.sleep(0.5)
        response_single_report = requests.request(
            "GET", url_report, headers=headers_report, data=payload_report)
        # save report contents as a temporary dict
        contents_single_report = response_single_report.json()
    except requests.RequestException as e:
        print(e)
    # returns None if the request failed
    return contents_single_report
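
# Illustrative usage sketch: report IDs come from the 'reportInfoId' field of
# get_report_list() results.
#   reports = get_report_list(country=["France"], start_date="2022-01-01")
#   first_id = reports["homePageDto"][0]["reportInfoId"]
#   contents = get_report_contents(first_id)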
def make_list_of_outbreaks_in_report(contents_single_report):
    '''Given the JSON contents of a WAHIS report, parse and clean the
    outbreaks it contains. That is, it extracts outbreaks from reports,
    returning a list of dictionaries of parsed outbreak information.'''
    list_of_outbreak_dicts = []
    if contents_single_report['eventOutbreakDto']['outbreakMap']:
        try:
            for key, value in contents_single_report['eventOutbreakDto']['outbreakMap'].items():
                # 1. GATHERING OUTBREAK DATA
                # merge dicts in list of "speciesDetails"
                try:
                    contents_single_report['eventOutbreakDto']['outbreakMap'][key]['speciesDetails'] = {
                        k: v for list_item in contents_single_report['eventOutbreakDto']['outbreakMap'][key]['speciesDetails'] for (k, v) in list_item.items()}
                except Exception:
                    print('Mistake with outbreak:',
                          contents_single_report['eventOutbreakDto']['outbreakMap'][key]['oieReference'])
                    # tk need a better way of dealing with reports that don't have outbreaks within them...
                    pprint.pprint(contents_single_report['eventOutbreakDto']['outbreakMap'][key])
                    continue
                # 2. CLEANING OUTBREAK DATA
                # within 'outbreakMap' create key "isWild" from the contents of the existing 'animalCategory'
                contents_single_report['eventOutbreakDto']['outbreakMap'][key]['isWild'] = contents_single_report[
                    'eventOutbreakDto']['outbreakMap'][key]['animalCategory'][0]['isWild']
                # remove the now unneeded 'animalCategory' with .pop()
                contents_single_report['eventOutbreakDto']['outbreakMap'][key].pop(
                    'animalCategory', None)
                # 3. APPEND CLEANED DATA TO OUTBREAK LIST (list_of_outbreak_dicts)
                list_of_outbreak_dicts.append(
                    contents_single_report['eventOutbreakDto']['outbreakMap'][key])
        except Exception as error:
            print(f"Unable to add {contents_single_report['reportDto']['reportId']}: {error}")
    # return an empty list (rather than None) when the report has no outbreaks,
    # so that callers can safely iterate over the result
    return list_of_outbreak_dicts
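
# Illustrative usage sketch, chained with get_report_contents():
#   contents = get_report_contents(first_id)
#   outbreaks = make_list_of_outbreaks_in_report(contents)
#   print(len(outbreaks))  # one dict per outbreak in the report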
def append_report_data_to_outbreak(list_of_outbreak_dicts, contents_single_report, hide_contact_info=True):
    '''Appends report data to each outbreak in a list of outbreaks (from the
    same report). Optionally removes identifying information about the people
    responsible for entering and maintaining WAHIS data in the relevant
    government bodies.'''
# Give the option to hide_contact_info of the people involved with submitting and processing WAHIS data
    if hide_contact_info:
# make lists of fields to remove from 'enterInfoDto' in report data
enterInfoDto_fields_to_remove = ['enterContactNum',
'enterEmail',
'enterFirstName',
'enterImagePath',
'enterLastName',
'enterTitle',
'recieverFullName']
# make lists of fields to remove from 'senderDto' in report data
senderDto_fields_to_remove = ['role',
'sendPath',
'senderAddress',
'senderEmail',
'senderFax',
'senderFirstName',
'senderFullName',
'senderLastName',
'senderNationalReference',
'senderTelephone',
'senderTitle']
for field in enterInfoDto_fields_to_remove:
contents_single_report['enterInfoDto'].pop(field, None)
for field in senderDto_fields_to_remove:
contents_single_report['senderDto'].pop(field, None)
    # because outbreak data has been acquired from make_list_of_outbreaks_in_report(),
    # remove 'eventOutbreakDto' to avoid duplicating data
contents_single_report.pop('eventOutbreakDto', None)
# for each outbreak in the list of outbreaks add information for report it
# was taken from
for outbreak_dict in list_of_outbreak_dicts:
outbreak_dict.update(contents_single_report)
return list_of_outbreak_dicts
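
# Illustrative end-to-end sketch of the parsing pipeline used in main().
# Note the order: make_list_of_outbreaks_in_report() must run before
# append_report_data_to_outbreak(), which pops 'eventOutbreakDto' from the
# report contents.
#   contents = get_report_contents(report_info_id)
#   outbreaks = make_list_of_outbreaks_in_report(contents)
#   rows = append_report_data_to_outbreak(outbreaks, contents, hide_contact_info=True)
#   df = pd.json_normalize(rows)  # one row per outbreak, report fields as columns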
def main():
##############################
### Parsing User Arguments ###
##############################
parser = argparse.ArgumentParser(description="Gather WAHIS reports based on user's filters.")
parser.add_argument("-op",
"--options",
action="store_true",
help=" Creates a file with the posible filter options for limiting your report results. See OUTPUTS folder in you current working directory.",
)
parser.add_argument("-c",
"--country",
type=str,
nargs="*",
default=[],
help="After flag ('-c' or '--country') add countries for which you want report results seperated by a single space. E.G. '-c France Germany Ethiopia'",
)
parser.add_argument("-r",
"--region",
type=str,
nargs="*",
default=[],
help="After flag ('-r' or '--region') add regions for which you want report results seperated by a single space. E.G. '-r Oceana Asia Europe'",
)
parser.add_argument("-d",
"--disease",
type=str,
nargs="*",
default=[],
help="Disease(s) of interst, entered in bewteem apostrophes seperated by spaces. Be carefull to add full official name. EG: -d 'Anthrax ' 'Morbillivirus (Inf. with)(marine mammals)(2008-)'",
)
parser.add_argument("-sd",
"--start_date",
required=False,
default=str(date.today() - timedelta(days=7)),
type=str,
help="[REQUIRED] Must be in 'YYYY-MM-DD' format and precede given 'end_date'.")
parser.add_argument("-ed",
"--end_date",
required=False,
default=str(date.today()),
type=str,
help="[REQUIRED] Must be in 'YYYY-MM-DD' format and be after given 'start_date'."
)
parser.add_argument("-s",
"--save_rate",
default=250,
type=int,
help="How many reports you want accessed before saving to output to computer. More = more demanding for your computer."
)
parsed_args = parser.parse_args()
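    # Illustrative CLI sketch (flag names defined above; the 'Anthrax '
    # spelling is taken from the --disease help example):
    #   python report_retriever.py -c France -d 'Anthrax ' -sd 2021-01-01 -ed 2021-12-31 -s 100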
    # get current directory and create a new folder "OUTPUTS" if it doesn't exist
    CURRENT_DIRECTORY = os.getcwd()
    OUTPUT_DIRECTORY = os.path.join(CURRENT_DIRECTORY, r'OUTPUTS')
    if not os.path.exists(OUTPUT_DIRECTORY):
        os.makedirs(OUTPUT_DIRECTORY)
    EXPORT_NAME = "WAHIS_ReportOutbreaks"
    # run save_filter_options() based on CLI input
    # (OUTPUT_DIRECTORY must be defined before this check)
    if parsed_args.options and not os.path.exists(os.path.join(OUTPUT_DIRECTORY, "WAHIS_filter_options.json")):
        save_filter_options(OUTPUT_DIRECTORY)
########################
### Main tool logic. ###
########################
if not parsed_args.options:
if (parsed_args.country
or parsed_args.region
or parsed_args.disease
or parsed_args.start_date
or parsed_args.end_date):
###### Get list of reports ######
reports_for_disease = get_report_list(
country=parsed_args.country,
region=parsed_args.region,
disease=parsed_args.disease,
start_date=parsed_args.start_date,
end_date=parsed_args.end_date,)
amalgam_reports_final = []
file_save_counter = 1
            print(f"Saving outputs to {OUTPUT_DIRECTORY}")
for count, report_object in enumerate(tqdm(reports_for_disease['homePageDto'], desc='Gathering Reports...'), 1):
# build list of outbreak dicts from each report; add report data and metadata to outbreak listing
# get single report contents
contents_single_report = get_report_contents(report_object['reportInfoId'])
try:
if contents_single_report['eventOutbreakDto']['outbreakMap']:
list_of_outbreak_dicts = make_list_of_outbreaks_in_report(
contents_single_report)
# add report metadata to each report_info in outbreak list
for outbreak in list_of_outbreak_dicts:
outbreak.update(report_object)
report_outbreaks = append_report_data_to_outbreak(list_of_outbreak_dicts,
contents_single_report,
hide_contact_info=True)
# add report metadata to each item in outbreak list
amalgam_reports_final.extend(report_outbreaks)
# once added to "definitive" list, delete temp variables
del contents_single_report, list_of_outbreak_dicts, report_outbreaks
                    else:
                        # report contains no outbreaks; nothing to add
                        pass
                except Exception as e:
                    print(
                        f"Report ID: {report_object['reportId']}. Error creating final report amalgam (amalgam_reports_final): {e}")
finally:
                    # save output every save_rate-th report, and after the final report
                    if count % parsed_args.save_rate == 0 or count == len(reports_for_disease['homePageDto']):
                        # make a pandas dataframe (aka a "table") with the data built up to this point
                        df_amalgam_reports_final = pd.json_normalize(amalgam_reports_final)
                        try:
                            df_amalgam_reports_final.to_excel(f'{OUTPUT_DIRECTORY}/{EXPORT_NAME}_{file_save_counter}.xlsx', index=False)
                        except Exception:
                            # fall back to CSV if Excel export fails (e.g. no openpyxl installed)
                            df_amalgam_reports_final.to_csv(f'{OUTPUT_DIRECTORY}/{EXPORT_NAME}_{file_save_counter}.csv', index=False)
                        # once the dataframe has been exported it is no longer needed. Delete to save RAM.
                        del df_amalgam_reports_final
                        # clear all contents of the list of outbreaks
                        amalgam_reports_final.clear()
                        file_save_counter += 1
if __name__ == "__main__":
main()