-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathutils.py
150 lines (131 loc) · 5.08 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
from collections import defaultdict
from simplejson import JSONDecodeError
import urllib3
from pyquery import PyQuery as pq
from xml.etree import ElementTree
import backoff
import requests
def extract_ctgov_xml(text):
    '''
    Flatten one trial-registry XML record into a dict of fields.

    `text` is parsed with PyQuery's XML parser and each field is the
    text of the elements matched by a selector. The result is a
    `defaultdict` built without a default factory, so looking up a
    missing key still raises KeyError. Every non-empty string value
    is UTF-8 encoded before the record is returned.
    '''
    doc = pq(text, parser='xml')

    def text_of(selector):
        # Text content of whatever `selector` matches in the record.
        return doc(selector).text()

    record = defaultdict(None)
    record['nct_id'] = text_of('nct_id')
    record['title'] = text_of('brief_title').strip()
    record['overall_status'] = text_of('overall_status').strip()
    # Store just the phase number(s), without the "Phase " prefix.
    record['phase'] = text_of('phase').replace("Phase ", "")
    record['lead_sponsor'] = text_of('lead_sponsor agency')
    record['lead_sponsor_class'] = text_of('lead_sponsor agency_class')
    record['study_type'] = text_of('study_type')
    record['completion_date'] = text_of('primary_completion_date')
    record['results_date'] = text_of('firstreceived_results_date')
    record['enrollment'] = text_of('enrollment')
    record['disposition_date'] = text_of(
        'firstreceived_results_disposition_date')

    # The collaborator fields are not currently used, but might be
    # useful in future. Note they aren't tested. (A `results_pmids`
    # field read from 'results_reference PMID' is disabled.)
    collaborators = doc('collaborator')
    record['collaborator'] = collaborators('agency').text()
    record['collaborator_class'] = collaborators('agency_class').text()

    # Each drug name is followed by '; ', including the last one.
    drug_entries = []
    for node in doc('intervention'):
        intervention = pq(node)
        if intervention('intervention_type').text() == 'Drug':
            name = intervention('intervention_name').text()
            drug_entries.append(name + '; ')
    record['has_drug_intervention'] = bool(drug_entries)
    record['drugs'] = ''.join(drug_entries)

    record['locations'] = text_of('location_countries country')

    # Only existing keys are reassigned, so iterating while updating
    # does not change the dict's size.
    for key in record:
        value = record[key]
        if value and isinstance(value, basestring):
            record[key] = value.encode('utf8')
    return record
def normalise_phase(x):
    '''
    Convert a phase label into a single integer.

    Trials without a phase ('N/A', e.g. device trials) become 5,
    i.e. later than phase 2, which is our cutoff for inclusion.
    Multi-phase trials take the earlier phase, so '1/2' becomes 1
    and '2/3' becomes 2. An unrecognised label raises KeyError.
    '''
    # Plain single-phase labels map straight to their own number.
    phase_numbers = dict((str(n), n) for n in (1, 2, 3, 4))
    # Labels that need an explicit decision.
    phase_numbers.update({
        'Early 1': 1,
        '1/2': 1,
        '2/3': 2,
        'N/A': 5,
    })
    return phase_numbers[x]
def extract_title_from_pubmed_data(text):
    '''
    Return the article title found in a PubMed XML document.

    Looks up the first `Article/ArticleTitle` element below the root
    and joins all of the text inside it, so titles containing inline
    markup such as <i> or <sub> are returned whole rather than being
    cut off at the first child tag.

    Returns the UTF-8 encoded title, None when the document has no
    ArticleTitle element, or '' when `text` is not well-formed XML
    (in which case the offending input is printed to stdout).
    '''
    try:
        tree = ElementTree.fromstring(text)
    except ElementTree.ParseError:
        # A single pre-formatted argument prints the same line whether
        # `print` is a statement or a function.
        print('ParseError %s' % (text,))
        return ''
    title = tree.find('.//Article/ArticleTitle')
    if title is None:
        return None
    # `title.text` alone stops at the first child element (and is None
    # when the title starts with one); itertext() walks all of it.
    return ''.join(title.itertext()).encode('utf8')
def get_pubmed_linked_articles_url(nct_id, completion_date,
                                   query_type):
    '''
    Build the PubMed esearch URL used to look for a trial's articles.

    The search term matches `nct_id` in the [si] field or in the
    title/abstract, restricted to publication dates from
    `completion_date` (formatted as YYYY/MM/DD) up to "3000".
    A `query_type` of 'broad' or 'narrow' appends the corresponding
    extra filter; any other value appends nothing.
    '''
    pieces = [
        'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/',
        'esearch.fcgi?db=pubmed&retmode=json&term=',
        '(%(id)s[si] OR %(id)s[Title/Abstract]) ' % {'id': nct_id},
        'AND ("%s"[pdat] : ' % completion_date.strftime('%Y/%m/%d'),
        '"3000"[pdat]) ',
    ]
    if query_type == 'broad':
        pieces.extend([
            "AND ((clinical[Title/Abstract] AND trial[Title/Abstract]) ",
            "OR clinical trials as topic[MeSH Terms] ",
            "OR clinical trial[Publication Type] ",
            "OR random*[Title/Abstract] ",
            "OR random allocation[MeSH Terms] ",
            "OR therapeutic use[MeSH Subheading])",
        ])
    elif query_type == 'narrow':
        pieces.extend([
            "AND (randomized controlled trial[Publication Type] OR ",
            "(randomized[Title/Abstract] ",
            "AND controlled[Title/Abstract] AND trial[Title/Abstract]))",
        ])
    return ''.join(pieces)
def extract_pubmed_ids_from_json(data):
    '''
    Return the 'idlist' held under data['esearchresult'].

    Gives an empty list when the search result carries no 'idlist'.
    A missing 'esearchresult' key raises KeyError.
    '''
    search_result = data['esearchresult']
    return search_result['idlist'] if 'idlist' in search_result else []
def is_study_protocol(title):
    '''
    Tell whether `title` contains 'study protocol', ignoring case.

    A falsy `title` (None or an empty string) is handed back as-is
    rather than being converted to False.
    '''
    if not title:
        return title
    return 'study protocol' in title.lower()
def get_response(url, timeout=30):
    '''
    Issue an HTTP GET for `url` and return the `requests` response.

    `timeout` is the number of seconds passed to `requests.get`
    (default 30). Without a timeout a stalled server would block the
    call indefinitely; with one, `requests` raises a
    `requests.exceptions.RequestException` subclass instead, which
    callers can catch and retry.
    '''
    return requests.get(url, timeout=timeout)
@backoff.on_exception(
    backoff.expo,
    (urllib3.exceptions.HTTPError,
     ValueError,
     JSONDecodeError,
     requests.exceptions.RequestException),
    max_tries=10)
def get_pubmed_title(pmid):
    '''
    Fetch a PubMed article by PMID and return its title.

    The efetch response body is passed to
    `extract_title_from_pubmed_data`, whose result is returned
    unchanged. The decorator retries the call with exponential
    backoff, up to 10 tries, on the listed exception types.
    '''
    query = 'db=pubmed&rettype=abstract&id=%s' % pmid
    efetch_url = (
        'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' + query)
    return extract_title_from_pubmed_data(get_response(efetch_url).content)
@backoff.on_exception(
    backoff.expo,
    (urllib3.exceptions.HTTPError,
     ValueError,
     JSONDecodeError,
     requests.exceptions.RequestException),
    max_tries=10)
def get_pubmed_linked_articles(nct_id, completion_date, query_type):
    '''
    Search PubMed for results articles related to an NCT ID.

    Runs the search built by `get_pubmed_linked_articles_url`, looks
    up the title of every returned PubMed ID, and drops the IDs whose
    title marks them as a study protocol. Returns the remaining IDs.
    The decorator retries the whole call with exponential backoff,
    up to 10 tries, on the listed exception types.
    '''
    search_url = get_pubmed_linked_articles_url(
        nct_id, completion_date, query_type)
    pmids = extract_pubmed_ids_from_json(get_response(search_url).json())
    # Decide what to drop while walking a snapshot, then remove from
    # the real list, so the list is never mutated mid-iteration.
    protocol_ids = [
        pmid for pmid in list(pmids)
        if is_study_protocol(get_pubmed_title(pmid))
    ]
    for pmid in protocol_ids:
        pmids.remove(pmid)
    return pmids