Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EVA-1700 — fixes to trait mapping pipeline #79

Merged
merged 8 commits into from
Oct 16, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions bin/clinvar_jsons/traits_to_zooma_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import requests

from clinvar_jsons_shared_lib import clinvar_jsons, get_traits_from_json, has_allowed_clinical_significance
from eva_cttv_pipeline.trait_mapping.utils import request_retry_helper
from eva_cttv_pipeline.trait_mapping.utils import json_request


DATE = strftime("%d/%m/%y %H:%M", gmtime())
Expand Down Expand Up @@ -110,7 +110,7 @@ def get_clinvar_accession(clinvar_json):

def get_zooma_uris(trait_name, zooma_host, filters):
url = build_zooma_query(trait_name, filters, zooma_host)
json_response = request_retry_helper(url)
json_response = json_request(url)

if json_response is None:
return None
Expand Down
5 changes: 1 addition & 4 deletions bin/trait_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ def launch():

main.main(parser.input_filepath, parser.output_mappings_filepath,
parser.output_curation_filepath, parser.filters, parser.zooma_host,
parser.oxo_target_list, parser.oxo_distance, parser.unattended)
parser.oxo_target_list, parser.oxo_distance)


class ArgParser:
Expand Down Expand Up @@ -39,8 +39,6 @@ def __init__(self, argv):
help="target ontologies to use with OxO")
parser.add_argument("-d", dest="oxo_distance", default=3,
help="distance to use to query OxO.")
parser.add_argument('-u', dest="unattended", action='store_true',
help="unattended launch, hide ETA estimates")

args = parser.parse_args(args=argv[1:])

Expand All @@ -55,7 +53,6 @@ def __init__(self, argv):
self.zooma_host = args.zooma_host
self.oxo_target_list = [target.strip() for target in args.oxo_target_list.split(",")]
self.oxo_distance = args.oxo_distance
self.unattended = args.unattended


if __name__ == '__main__':
Expand Down
6 changes: 3 additions & 3 deletions docs/submit-opentargets-batch.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ A new Java package will be generated in the directory `clinvar-xml-parser/src/ma
Here we transform ClinVar's XML file into a JSON file which can be parsed by the downstream tools, using an XML parser which we (if necessary) updated during the previous step.

```bash
cd ${CODE_ROOT} && ${BSUB_CMDLINE} -M 4G \
cd ${CODE_ROOT} && ${BSUB_CMDLINE} -n 8 -M 4G \
-o ${BATCH_ROOT}/logs/convert_clinvar_files.out \
-e ${BATCH_ROOT}/logs/convert_clinvar_files.err \
java -jar ${CODE_ROOT}/clinvar-xml-parser/target/clinvar-parser-1.0-SNAPSHOT-jar-with-dependencies.jar \
Expand Down Expand Up @@ -168,10 +168,10 @@ The TSV file eventually returned by OpenTargets has these columns:
See information about the trait mapping pipeline [here](trait-mapping-pipeline.md). It is run with the following command:

```bash
cd ${CODE_ROOT} && ${BSUB_CMDLINE} \
cd ${CODE_ROOT} && ${BSUB_CMDLINE} -M 4G \
-o ${BATCH_ROOT}/logs/trait_mapping.out \
-e ${BATCH_ROOT}/logs/trait_mapping.err \
python bin/trait_mapping.py -u \
python bin/trait_mapping.py \
-i ${BATCH_ROOT}/clinvar/clinvar.filtered.json.gz \
-o ${BATCH_ROOT}/trait_mapping/automated_trait_mappings.tsv \
-c ${BATCH_ROOT}/trait_mapping/traits_requiring_curation.tsv
Expand Down
2 changes: 1 addition & 1 deletion eva_cttv_pipeline/evidence_string_generation/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import logging
logging.basicConfig()
logger = logging.getLogger(__package__)
logger.setLevel(level=logging.INFO)
logger.setLevel(level=logging.DEBUG)
2 changes: 1 addition & 1 deletion eva_cttv_pipeline/trait_mapping/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import logging
logging.basicConfig()
logger = logging.getLogger(__package__)
logger.setLevel(level=logging.INFO)
logger.setLevel(level=logging.DEBUG)
47 changes: 23 additions & 24 deletions eva_cttv_pipeline/trait_mapping/main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from collections import Counter
import csv
import logging
import progressbar
import multiprocessing

from eva_cttv_pipeline.trait_mapping.output import output_trait
from eva_cttv_pipeline.trait_mapping.oxo import get_oxo_results
Expand Down Expand Up @@ -30,8 +30,7 @@ def get_uris_for_oxo(zooma_result_list: list) -> set:
return uri_set


def process_trait(trait: Trait, filters: dict, zooma_host: str, oxo_target_list: list,
oxo_distance: int) -> Trait:
def process_trait(trait: Trait, filters: dict, zooma_host: str, oxo_target_list: list, oxo_distance: int) -> Trait:
"""
Process a single trait. Find any mappings in Zooma. If there are no high confidence Zooma
mappings that are in EFO then query OxO with any high confidence mappings not in EFO.
Expand All @@ -45,8 +44,8 @@ def process_trait(trait: Trait, filters: dict, zooma_host: str, oxo_target_list:
"distance" parameter.
:return: The original trait after querying Zooma and possibly OxO, with any results found.
"""

logger.debug('Processing trait {}'.format(trait.name))

trait.zooma_result_list = get_zooma_results(trait.name, filters, zooma_host)
trait.process_zooma_results()
if (trait.is_finished
Expand All @@ -55,45 +54,45 @@ def process_trait(trait: Trait, filters: dict, zooma_host: str, oxo_target_list:
for mapping in trait.zooma_result_list
for entry in mapping.mapping_list])):
return trait

uris_for_oxo_set = get_uris_for_oxo(trait.zooma_result_list)
if len(uris_for_oxo_set) == 0:
return trait
oxo_input_id_list = uris_to_oxo_format(uris_for_oxo_set)
if len(oxo_input_id_list) == 0:
return trait
trait.oxo_result_list = get_oxo_results(oxo_input_id_list, oxo_target_list, oxo_distance)

if not trait.oxo_result_list:
logger.warning('No OxO mapping for trait {}'.format(trait.name))

logger.debug('No OxO mapping for trait {}'.format(trait.name))
trait.process_oxo_mappings()

return trait


def main(input_filepath, output_mappings_filepath, output_curation_filepath, filters, zooma_host,
oxo_target_list, oxo_distance, unattended):
def main(input_filepath, output_mappings_filepath, output_curation_filepath, filters, zooma_host, oxo_target_list,
oxo_distance):
logger.info('Started parsing trait names')
trait_names_list = parse_trait_names(input_filepath)
trait_names_counter = Counter(trait_names_list)
logger.info("Loaded {} trait names".format(len(trait_names_counter)))

with open(output_mappings_filepath, "w", newline='') as mapping_file, \
open(output_curation_filepath, "wt") as curation_file:
mapping_writer = csv.writer(mapping_file, delimiter="\t")
mapping_writer.writerow(["#clinvar_trait_name", "uri", "label"])
curation_writer = csv.writer(curation_file, delimiter="\t")

trait_names_iterator = trait_names_counter.items()
if not unattended:
progress = progressbar.ProgressBar(max_value=len(trait_names_counter),
widgets=[progressbar.AdaptiveETA(samples=1000)])
trait_names_iterator = progress(trait_names_iterator)

logger.info("Loaded {} trait names".format(len(trait_names_counter)))
for i, (trait_name, freq) in enumerate(trait_names_iterator):
trait = Trait(trait_name, freq)
trait = process_trait(trait, filters, zooma_host, oxo_target_list,
oxo_distance)
logger.info('Processing trait names in parallel')
trait_list = [Trait(trait_name, freq) for trait_name, freq in trait_names_counter.items()]
trait_process_pool = multiprocessing.Pool(processes=12)

processed_trait_list = [
trait_process_pool.apply(
process_trait,
args=(trait, filters, zooma_host, oxo_target_list, oxo_distance)
)
for trait in trait_list
]

for trait in processed_trait_list:
output_trait(trait, mapping_writer, curation_writer)
if unattended and i % 100 == 0:
logger.info("Processed {} records".format(i))

logger.info('Finished processing trait names')
10 changes: 6 additions & 4 deletions eva_cttv_pipeline/trait_mapping/ols.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import requests
import urllib

from eva_cttv_pipeline.trait_mapping.utils import request_retry_helper
from eva_cttv_pipeline.trait_mapping.utils import json_request


OLS_EFO_SERVER = 'https://www.ebi.ac.uk/ols'
Expand All @@ -30,14 +30,15 @@ def get_ontology_label_from_ols(ontology_uri: str) -> str:
:return: Term label for the ontology URI provided in the parameters.
"""
url = build_ols_query(ontology_uri)
json_response = request_retry_helper(url)
json_response = json_request(url)

if not json_response:
return None

# If the '_embedded' section is missing from the response, it means that the term is not found in OLS
if '_embedded' not in json_response:
logger.warning('OLS queried OK but did not return any results for URL {}'.format(url))
if '/medgen/' not in url and '/omim/' not in url:
logger.warning('OLS queried OK but did not return any results for URL {}'.format(url))
return None

# Go through all terms found by the requested identifier and try to find the one where the _identifier_ and the
Expand All @@ -48,7 +49,8 @@ def get_ontology_label_from_ols(ontology_uri: str) -> str:
if term["is_defining_ontology"]:
return term["label"]

logger.warning('OLS queried OK, but there is no defining ontology in its results for URL {}'.format(url))
if '/medgen/' not in url and '/omim/' not in url:
logger.warning('OLS queried OK, but there is no defining ontology in its results for URL {}'.format(url))
return None


Expand Down
11 changes: 9 additions & 2 deletions eva_cttv_pipeline/trait_mapping/oxo.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from functools import total_ordering, lru_cache
import logging
import re
import requests

from eva_cttv_pipeline.trait_mapping.ols import get_ontology_label_from_ols, is_in_efo
from eva_cttv_pipeline.trait_mapping.ols import is_current_and_in_efo
from eva_cttv_pipeline.trait_mapping.utils import request_retry_helper
from eva_cttv_pipeline.trait_mapping.utils import json_request


logger = logging.getLogger(__package__)
Expand Down Expand Up @@ -206,7 +207,13 @@ def get_oxo_results(id_list: list, target_list: list, distance: int) -> list:
"""
url = "https://www.ebi.ac.uk/spot/oxo/api/search?size=5000"
payload = build_oxo_payload(id_list, target_list, distance)
oxo_response = request_retry_helper(url, payload)
try:
oxo_response = json_request(url, payload, requests.post)
except requests.HTTPError:
# Sometimes, OxO fails to process a completely valid request even after several attempts.
# See https://github.com/EBISPOT/OXO/issues/26 for details
logger.error('OxO failed to process request for id_list {} (probably a known bug in OxO)'.format(id_list))
return []

if oxo_response is None:
return []
Expand Down
39 changes: 8 additions & 31 deletions eva_cttv_pipeline/trait_mapping/utils.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,14 @@
import logging
import requests
from retry import retry
logger = logging.getLogger(__package__)


def json_request(url: str, payload: dict) -> dict:
"""Makes a GET request with the specified URL and payload, attempts to parse the result as a JSON string and
return it as a dictionary, on failure raises an exception."""
result = requests.get(url, data=payload)
assert result.ok
@retry(exceptions=(ConnectionError, requests.RequestException), logger=logger,
tries=4, delay=2, backoff=1.2, jitter=(1, 3))
def json_request(url: str, payload: dict = None, method=requests.get) -> dict:
"""Makes a request of a specified type (by default GET) with the specified URL and payload, attempts to parse the
result as a JSON string and return it as a dictionary, on failure raises an exception."""
result = method(url, data=payload)
result.raise_for_status()
return result.json()


def retry_helper(function, kwargs: dict, retry_count: int):
"""
Given a function, make a `retry_count` number of attempts to call it until it returns a value without raising
an exception, and subsequently return this value. If all attempts to run the function are unsuccessful, return None.
:param function: Function that could need multiple attempts to return a value
:param kwargs: Dictionary with function's keyword arguments
:param retry_count: Number of attempts to make
:return: Returned value of the function.
"""
for retry_num in range(retry_count):
try:
return function(**kwargs)
except Exception as e:
logger.warning("Attempt {}: failed running function {} with kwargs {}".format(retry_num, function, kwargs))
logger.warning(e)
logger.warning("Error on last attempt, skipping")
return None


def request_retry_helper(url: str, payload: dict = None, retry_count: int = 4):
"""Makes a GET request with the specified URL and payload via `json_request`, makes several attempts and handles
the exceptions via `retry_helper`."""
return retry_helper(json_request, {'url': url, 'payload': payload}, retry_count)
6 changes: 2 additions & 4 deletions eva_cttv_pipeline/trait_mapping/zooma.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging

from eva_cttv_pipeline.trait_mapping.ols import get_ontology_label_from_ols, is_current_and_in_efo, is_in_efo
from eva_cttv_pipeline.trait_mapping.utils import request_retry_helper
from eva_cttv_pipeline.trait_mapping.utils import json_request


logger = logging.getLogger(__package__)
Expand Down Expand Up @@ -98,7 +98,7 @@ def get_zooma_results(trait_name: str, filters: dict, zooma_host: str) -> list:
"""

url = build_zooma_query(trait_name, filters, zooma_host)
zooma_response_list = request_retry_helper(url)
zooma_response_list = json_request(url)

if zooma_response_list is None:
return []
Expand All @@ -112,8 +112,6 @@ def get_zooma_results(trait_name: str, filters: dict, zooma_host: str) -> list:
zooma_mapping.ontology_label = label
else:
# If no label is returned (because OLS failed to provide it), keep the existing one from ZOOMA
logger.warning(("Couldn't retrieve ontology label from OLS for trait '{}', using label specified "
"by ZOOMA instead").format(trait_name))
zooma_mapping.ontology_label = zooma_result.zooma_label

uri_is_current_and_in_efo = is_current_and_in_efo(zooma_mapping.uri)
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ query-string==0.0.4
request==0.0.2
requests>=2.20.0
requests_mock
retry==0.9.2
setuptools-git==1.1
sh==1.11
six==1.12.0
Expand Down