diff --git a/bin/convert_owl.py b/bin/convert_owl.py
index adc2464..4bf5196 100644
--- a/bin/convert_owl.py
+++ b/bin/convert_owl.py
@@ -1,8 +1,9 @@
 """Convert OWL to FHIR"""
+import json
 import os
 import subprocess
 from argparse import ArgumentParser
-from typing import Dict
+from typing import Dict, List
 
 import curies
 import requests
@@ -40,28 +41,33 @@
         'url': 'https://github.com/monarch-initiative/mondo/releases/latest/download/mondo.owl',
         'input_path': os.path.join(OWL_ON_FHIR_CONTENT_REPO_PATH, 'input', 'mondo.owl'),
         'id': 'mondo',
+        'native_uri_stems': ['http://purl.obolibrary.org/obo/MONDO_'],
     },
     'comp-loinc': {
         'url': 'https://github.com/loinc/comp-loinc/releases/latest/download/merged_reasoned_loinc.owl',
         'input_path': os.path.join(OWL_ON_FHIR_CONTENT_REPO_PATH, 'input', 'comploinc.owl'),
         'id': 'comp-loinc',
+        'native_uri_stems': ['https://loinc.org/'],
     },
     'HPO': {
         'url': 'https://github.com/obophenotype/human-phenotype-ontology/releases/latest/download/hp-full.owl',
         'input_path': os.path.join(OWL_ON_FHIR_CONTENT_REPO_PATH, 'input', 'hpo.owl'),
         'id': 'HPO',
+        'native_uri_stems': ['http://purl.obolibrary.org/obo/HP_'],
     },
     'rxnorm': {
         'url': 'https://data.bioontology.org/'
                'ontologies/RXNORM/submissions/23/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb',
         'input_path': os.path.join(OWL_ON_FHIR_CONTENT_REPO_PATH, 'input', 'RXNORM.ttl'),
         'id': 'rxnorm',
+        'native_uri_stems': ['http://purl.bioontology.org/ontology/RXNORM/'],
     },
     'sequence-ontology': {
         'url': 'https://data.bioontology.org/'
                'ontologies/SO/submissions/304/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb',
         'input_path': os.path.join(OWL_ON_FHIR_CONTENT_REPO_PATH, 'input', 'so.owl'),
         'id': 'sequence-ontology',
+        'native_uri_stems': ['http://purl.obolibrary.org/obo/SO_'],
     },
 }
 
@@ -143,7 +149,7 @@ def owl_to_semsql(inpath: str, use_cache=False) -> str:
     return outpath
 
 
-def owl_to_obograph(inpath: str, use_cache=False) -> str:
+def owl_to_obograph(inpath: str, native_uri_stems: List[str] = None, use_cache=False) -> str:
    """Convert OWL to Obograph
    # todo: TTL and RDF also supported? not just OWL?"""
    # Vars
@@ -162,67 +168,27 @@ def owl_to_obograph(inpath: str, use_cache=False) -> str:
     # graph = parse_results.graph_document.graphs[0]
     _run_shell_command(command)
 
-    # todo: might want to add this patch back and open up an issue, because nico said in issue below shouldn't happen
-    #  - issue would be in OAK, regarding the 'cooked' error
     # Patch missing roots / etc issue (until resolved: https://github.com/ontodev/robot/issues/1082)
-    # ! - Deactivated this because I was getting an error about the very same IDs that Chris R was asking for
-    #     Try uploading and see if it works.
-    #
-    #  - This appears to be mostly a problem in FHIR (and maybe just Obographs) if subClassOf or variation missing, but
-    #    not 100% sure
-    #
-    # missing_nodes_from_important_edge_preds = [
-    #     'is_a',
-    #     'http://purl.bioontology.org/ontology/RXNORM/isa',
-    #     'rdfs:subClassOf',
-    #     'http://www.w3.org/2000/01/rdf-schema#subClassOf'
-    # ]
-    # with open(outpath, 'r') as f:
-    #     data = json.load(f)
-    # nodes = data['graphs'][0]['nodes']
-    # node_ids = set([node['id'] for node in nodes])
-    # edges = data['graphs'][0]['edges']
-    # edges = [x for x in edges if x['pred'] in missing_nodes_from_important_edge_preds]
-    # edge_subs = set([edge['sub'] for edge in edges])
-    # edge_objs = set([edge['obj'] for edge in edges])
-    # edge_ids = edge_subs.union(edge_objs)
-    # missing = set([x for x in edge_ids if x not in node_ids])
-
-    # Edge case exclusions
-    #  - This was causing the following error in OAK (I have not made a GH issue):
-    #  - This example was from Mondo
-    #    cooked_entry = Node(id="JsonObj(id='http://www.geneontology.org/formats/oboInOwl#Subset')", ...
-    #    if cooked_entry[key_name] != key:
-    #    >           raise ...
-    #    E           ValueError: Slot: nodes - attribute id value (JsonObj(
-    #        id='http://www.geneontology.org/formats/oboInOwl#Subset'))
-    #        does not match key (http://www.geneontology.org/formats/oboInOwl#Subset)
-    #
-    # Method A: Remove cases
-    # id_exclusions = [
-    #     'http://www.geneontology.org/formats/oboInOwl#Subset'
-    # ]
-    # uri_stem_exclusions = [
-    #     'http://purl.obolibrary.org/obo/CARO_'
-    # ]
-    # for case in id_exclusions:
-    #     if case in missing:
-    #         missing.remove(case)
-    # missing2 = []
-    # for node_id in missing:
-    #     if not any([node_id.startswith(x) for x in uri_stem_exclusions]):
-    #         missing2.append(node_id)
-    #
-    # Method B: Keep only dominant IDs
-    #  - Opted not to do
-
-    # if missing2:
-    #     print(f'INFO: The following nodes were found in Obographs edges, but not nodes. Adding missing declarations: '
-    #           f'{missing}')
-    #     for node_id in missing:
-    #         nodes.append({'id': node_id})
-    #     with open(outpath, 'w') as f:
-    #         json.dump(data, f)
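+    # The Obographs JSON emitted by `robot` can contain edges whose subject/object IDs were never declared as
+    # nodes, which appears to cause problems downstream (in FHIR conversion, and possibly in Obographs itself).
+    # The block below re-declares those IDs, but only the ones matching the ontology's native URI stems, so that
+    # foreign (imported) concepts are not added to the CodeSystem.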
+    if native_uri_stems:
+        with open(outpath, 'r') as f:
+            data = json.load(f)
+        nodes = data['graphs'][0]['nodes']
+        node_ids = set([node['id'] for node in nodes])
+        edges = data['graphs'][0]['edges']
+        # edges = [x for x in edges if x['pred'] in missing_nodes_from_important_edge_preds]
+        edge_subs = set([edge['sub'] for edge in edges])
+        edge_objs = set([edge['obj'] for edge in edges])
+        edge_ids = edge_subs.union(edge_objs)
+        missing = set([x for x in edge_ids if x not in node_ids])  # all missing
+        missing = [x for x in missing if any([x.startswith(y) for y in native_uri_stems])]  # filter
+
+        if missing:
+            print(f'INFO: The following nodes were found in Obographs edges, but not nodes. Adding missing declarations: '
+                  f'{missing}')
+            for node_id in missing:
+                nodes.append({'id': node_id})
+            with open(outpath, 'w') as f:
+                json.dump(data, f)
 
     return outpath
 
 
@@ -231,7 +197,10 @@ def owl_to_obograph(inpath: str, use_cache=False) -> str:
 # - https://github.com/linkml/linkml/issues/1156
 # - https://github.com/ontodev/robot/issues/1079
 # - https://github.com/geneontology/obographs/issues/89
-def obograph_to_fhir(inpath: str, out_dir: str, out_filename: str = None, include_all_predicates=False) -> str:
+def obograph_to_fhir(
+        inpath: str, out_dir: str, out_filename: str = None, include_all_predicates=False,
+        native_uri_stems: List[str] = None
+) -> str:
     """Convert Obograph to FHIR"""
     converter = OboGraphToFHIRConverter()
     converter.curie_converter = curies.Converter.from_prefix_map(get_default_prefix_map())
@@ -239,7 +208,13 @@ def obograph_to_fhir(inpath: str, out_dir: str, out_filename: str = None, includ
     out_path = os.path.join(out_dir, out_filename)
     if not os.path.exists(out_dir):
         os.makedirs(out_dir)
-    converter.dump(gd, out_path, include_all_predicates=include_all_predicates)
+    # todo: this try/except is only temporary until the latest params from dev version of OAK get released
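+    # If the installed OAK release does not yet accept `native_uri_stems`, dump() will most likely raise a
+    # TypeError for the unexpected keyword argument; in that case, fall back to calling it without the param.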
+    try:
+        converter.dump(gd, out_path, include_all_predicates=include_all_predicates, native_uri_stems=native_uri_stems)
+    except Exception as e:
+        # TODO: find out what exception it is and handle it
+        print(e)
+        converter.dump(gd, out_path, include_all_predicates=include_all_predicates)
     return out_path
 
 
@@ -260,7 +235,7 @@ def semsql_to_fhir(inpath: str, out_dir: str, out_filename: str = None, include_
 def owl_to_fhir(
         input_path_or_url: str, out_dir: str = OUTDIR, out_filename: str = None, include_all_predicates=False,
         retain_intermediaries=False, intermediary_type=['obographs', 'semsql'][0], use_cached_intermediaries=False,
-        intermediary_outdir: str = None, convert_intermediaries_only=False
+        intermediary_outdir: str = None, convert_intermediaries_only=False, native_uri_stems: List[str] = None,
 ) -> str:
     """Run conversion"""
     # Download if necessary & determine outpaths
@@ -285,19 +260,14 @@ def owl_to_fhir(
 
     # Convert
     if intermediary_type == 'obographs' or input_path.endswith('.ttl'):  # semsql only supports .owl
-        intermediary_path = owl_to_obograph(input_path, use_cached_intermediaries)
+        intermediary_path = owl_to_obograph(input_path, native_uri_stems, use_cached_intermediaries)
         obograph_to_fhir(
-            inpath=intermediary_path,
-            out_dir=intermediary_outdir,
-            out_filename=out_filename,
-            include_all_predicates=include_all_predicates)
+            inpath=intermediary_path, out_dir=intermediary_outdir, out_filename=out_filename,
+            native_uri_stems=native_uri_stems, include_all_predicates=include_all_predicates)
     else:  # semsql
         intermediary_path = owl_to_semsql(input_path, use_cached_intermediaries)
         semsql_to_fhir(
-            inpath=intermediary_path,
-            out_dir=intermediary_outdir,
-            out_filename=out_filename,
-            include_all_predicates=include_all_predicates)
+            inpath=intermediary_path, out_dir=intermediary_outdir, out_filename=out_filename, include_all_predicates=include_all_predicates)
     if convert_intermediaries_only:
         return intermediary_path
 
@@ -354,6 +324,15 @@ def cli():
     parser.add_argument(
         '-p', '--include-all-predicates', action='store_true', required=False, default=False,
         help='Include all predicates in CodeSystem.property and CodeSystem.concept.property, or just is_a/parent?')
+    parser.add_argument(
+        '-u', '--native-uri-stems', required=False, nargs='+',
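+        # Note: with nargs='+', stems are passed as separate, space-delimited CLI values (-u STEM1 STEM2 ...).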
+        help='A comma-separated list of URI stems that will be used to determine whether a concept is native to '
+             'the CodeSystem. For example, for OMIM, the following URI stems are native: '
+             'https://omim.org/entry/,https://omim.org/phenotypicSeries/PS. '
+             'As of 2023-01-15, there is still a bug in the Obographs spec and/or `robot` where certain nodes are not'
+             ' being converted. This converter adds back the nodes, but to know which ones belong to the CodeSystem '
+             'itself and are not foreign concepts, this parameter is necessary. OAK also makes use of this parameter. '
+             'See also: https://github.com/geneontology/obographs/issues/90')
     parser.add_argument(
         '-t', '--intermediary-type', choices=INTERMEDIARY_TYPES, default='obographs', required=False,
         help='Which type of intermediary to use? First, we convert OWL to that intermediary format, and then we '