-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathresult_formatter.py
109 lines (88 loc) · 3.62 KB
/
result_formatter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
from __future__ import print_function
from __future__ import unicode_literals
import logging
from collections import Counter
def filter_events(results):
    """
    Re-keys PETRARCH-coded events so that every event gets its own unique
    (DATE, SOURCE, TARGET, EVENT, COUNTER) dictionary key.

    Despite the name, no event is dropped: when the same
    (DATE, SOURCE, TARGET, EVENT) combination shows up more than once, each
    occurrence is kept and told apart by ``COUNTER`` (0 for the first
    occurrence, 1 for the second, and so on).

    Parameters
    ----------

    results: Dictionary.
             PETRARCH-formatted results in the
             {StoryID: [(record), (record)]} format. Records are read
             positionally. A 7-field record is read as
             (date, src, target, code, ids, url, source); any other record
             is read as
             (date, src, target, code, issues, ids, url, source).
             ``ids`` is a ';'-separated string and ``issues`` is a
             ';'-separated string of 'ISSUE,COUNT' entries.

    Returns
    -------

    formatted: Dictionary.
               Keys are (DATE, SOURCE, TARGET, EVENT, COUNTER) tuples.
               Values are dictionaries with the keys 'issues' (a Counter
               mapping issue to summed count), 'ids' (list of IDs),
               'sources' (one-element list) and 'urls' (one-element list).
               The ``COUNTER`` in the tuple is a hackish workaround since
               each key has to be unique in the dictionary and the goal is
               to have every coded event appear even if it's a duplicate.
               Other code will just ignore this counter.

    Raises
    ------

    ValueError
        If a non-blank issue entry is not exactly one 'ISSUE,COUNT' pair or
        its COUNT is not an integer.
    IndexError
        If a record has fewer fields than the layout it is read with.
    """
    formatted = {}
    # Next unused COUNTER per (date, src, target, code). ``formatted`` starts
    # empty and is only filled here, so this yields exactly the same keys as
    # probing 0, 1, 2, ... until a free slot is found, without the re-scan.
    next_counter = {}

    for story in results:
        for event in results[story]:
            base_key = (event[0], event[1], event[2], event[3])

            if len(event) == 7:
                # Record without an issues field.
                issues = []
                ids = event[4].split(';')
                url = event[5]
                source = event[6]
            else:
                issues = event[4].split(';')
                ids = event[5].split(';')
                url = event[6]
                source = event[7]

            counter = next_counter.get(base_key, 0)
            next_counter[base_key] = counter + 1
            event_tuple = base_key + (counter,)

            issue_counts = Counter()
            for issue_str in issues:
                # An empty issues field (or a trailing ';') produces blank
                # entries; they carry no issue, so skip them rather than
                # failing on the 'ISSUE,COUNT' unpacking below.
                if not issue_str.strip():
                    continue
                issue, count = issue_str.split(',')
                issue_counts[issue] += int(count)

            formatted[event_tuple] = {'issues': issue_counts, 'ids': ids,
                                      'sources': [source], 'urls': [url]}

    return formatted
def main(results):
    """
    Entry point of the formatter: logs that formatting has started and
    hands the PETRARCH results to ``filter_events``, returning whatever it
    produces.

    Parameters
    ----------

    results: Dictionary.
             PETRARCH-formatted results in the
             {StoryID: [(record), (record)]} format.

    Returns
    -------

    formatted_dict: Dictionary.
                    The return value of ``filter_events(results)``. Keys are
                    (DATE, SOURCE, TARGET, EVENT, COUNTER) tuples, values
                    hold the IDs, sources, URLs and issues of each event.
                    The ``COUNTER`` in the tuple is a hackish workaround
                    since each key has to be unique in the dictionary and
                    the goal is to have every coded event appear even if
                    it's a duplicate. Other code will just ignore this
                    counter.
    """
    # Progress is reported on the named 'pipeline_log' logger.
    pipeline_logger = logging.getLogger('pipeline_log')
    pipeline_logger.info('Formatting PETRARCH results.')

    return filter_events(results)