-
Notifications
You must be signed in to change notification settings - Fork 0
/
utilities.py
113 lines (98 loc) · 3.29 KB
/
utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import collections
import json
import logging
import functools
import re
import altair
import nameparser
# Shared keyword arguments for every json.dump/json.dumps call in this module:
# keep non-ASCII characters readable, pretty-print with a 2-space indent, and
# emit keys in deterministic sorted order.
json_dump_kwargs = {
    'ensure_ascii': False,
    'indent': 2,
    'sort_keys': True,
}
def tidy_split(df, column, sep='|', keep=False):
    """
    Expand a delimited column so that each split value occupies its own row.

    Rows where `column` is missing are dropped before splitting. Approach
    discussed at http://stackoverflow.com/a/39946744/4651668.

    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        whether to retain the presplit value as its own row

    Returns
    -------
    pandas.DataFrame
        Returns a dataframe with the same columns as `df`.
    """
    df = df.dropna(subset=[column])
    row_positions = []
    expanded_values = []
    for position, raw in enumerate(df[column].astype(str)):
        pieces = raw.split(sep)
        # Optionally emit the original unsplit value ahead of its pieces.
        if keep and len(pieces) > 1:
            row_positions.append(position)
            expanded_values.append(raw)
        for piece in pieces:
            row_positions.append(position)
            expanded_values.append(piece)
    # Duplicate source rows by position, then overwrite the target column.
    result = df.iloc[row_positions, :].copy()
    result[column] = expanded_values
    return result
def df_to_vega_lite(df, path=None):
    """
    Export a pandas.DataFrame to a vega-lite data JSON.

    Params
    ------
    df : pandas.DataFrame
        dataframe to convert to JSON
    path : None or str
        if None, return the JSON str. Else write JSON to the file specified by
        path.
    """
    # Let altair perform the records-style conversion, then pull the raw
    # data values back out of the chart specification.
    values = altair.Chart(data=df).to_dict()['data']['values']
    if path is None:
        return json.dumps(values, **json_dump_kwargs)
    with open(path, 'w') as write_file:
        json.dump(values, write_file, **json_dump_kwargs)
def df_to_datatables(df, path=None, double_precision=5, indent=2):
    """
    Convert a pandas dataframe to a JSON object formatted for datatables input.

    Params
    ------
    df : pandas.DataFrame
        dataframe to convert
    path : None or str
        if None, return the JSON str. Else write JSON to the file specified by
        path.
    double_precision : int
        maximum number of decimal places used when serializing floats
    indent : int
        number of spaces of indentation in the output JSON. Previously this
        parameter was accepted but silently ignored (json_dump_kwargs always
        forced indent=2); it is now honored. The default preserves the old
        output exactly.
    """
    dump_str = df.to_json(orient='split', double_precision=double_precision)
    obj = json.loads(dump_str)
    # datatables only needs 'columns' and 'data'; drop the row index.
    del obj['index']
    obj = collections.OrderedDict(obj)
    # Keep 'data' as the final key for readability of the emitted JSON.
    obj.move_to_end('data')
    # Fix: apply the caller's indent instead of the module-wide hard-coded 2.
    dump_kwargs = dict(json_dump_kwargs, indent=indent)
    if path is None:
        return json.dumps(obj, **dump_kwargs)
    with open(path, 'w') as write_file:
        json.dump(obj, write_file, **dump_kwargs)
# Invalid name characters from http://stackoverflow.com/q/1261338/4651668
# Compiled once at module level for reuse by get_standard_author; a match
# anywhere in an author string marks that author as erroneous.
invalid_name = re.compile(r"[<,\"@/{}*$%?=>:|;#]")
@functools.lru_cache(maxsize=10**6)
def get_standard_author(author):
    """
    Given a bioRxiv author, return their name in 'first last' format. Return
    `None` if the author is detected to be erroneous or not an individual.

    Params
    ------
    author : str
        raw author string as it appears in bioRxiv metadata

    Returns
    -------
    str or None
        name as 'first last', or None when the entry looks like a
        consortium/project or contains invalid characters.
    """
    # Strip trailing separators and footnote markers often attached to names.
    author = author.rstrip(',*;')
    author_lower = author.lower()
    if 'consortium' in author_lower:
        # Lazy %-style arguments defer string formatting until the log record
        # is actually emitted (idiomatic logging usage).
        logging.info('"%s" removed as a consortium', author)
        return None
    if 'project' in author_lower:
        logging.info('"%s" removed as project', author)
        return None
    # invalid_name is already a compiled pattern: call its search method
    # directly instead of routing back through re.search.
    if invalid_name.search(author):
        logging.info('"%s" removed due to invalid characters', author)
        return None
    name = nameparser.HumanName(author)
    return '{} {}'.format(name.first, name.last)