Skip to content

Commit

Permalink
OWL on FHIR
Browse files Browse the repository at this point in the history
- Add: param: --native-uri-stems: A comma-separated list of URI stems that will be used to determine whether a concept is native to the CodeSystem. As of 2023-01-15, there is still a bug in the Obographs spec and/or `robot` where certain nodes are not being converted. This converter adds back the nodes, but to know which ones belong to the CodeSystem itself and are not foreign concepts, this parameter is necessary. OAK also makes use of this parameter. See also: geneontology/obographs#90
- Add: params: --dev-oak-path and --dev-oak-interpreter-path: If you want to use a local development version of OAK, specify these paths.
  • Loading branch information
joeflack4 committed Jan 16, 2023
1 parent 4b2dbcc commit 2f46a1f
Showing 1 changed file with 132 additions and 103 deletions.
235 changes: 132 additions & 103 deletions bin/convert_owl.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
"""Convert OWL to FHIR"""
import json
import os
import subprocess
from argparse import ArgumentParser
from typing import Dict
from collections import OrderedDict
from typing import Dict, List

import curies
import requests
Expand Down Expand Up @@ -33,37 +35,47 @@
'intermediary_type': 'obographs',
'use_cached_intermediaries': True,
'retain_intermediaries': True,
'convert_intermediaries_only': True,
'convert_intermediaries_only': False,
}
FAVORITE_ONTOLOGIES = {
FAVORITE_ONTOLOGIES = OrderedDict({
'mondo': {
'url': 'https://github.com/monarch-initiative/mondo/releases/latest/download/mondo.owl',
'download_url': 'https://github.com/monarch-initiative/mondo/releases/latest/download/mondo.owl',
'code_system_url': 'http://purl.obolibrary.org/obo/mondo.owl',
'input_path': os.path.join(OWL_ON_FHIR_CONTENT_REPO_PATH, 'input', 'mondo.owl'),
'id': 'mondo',
'code_system_id': 'mondo',
'native_uri_stems': ['http://purl.obolibrary.org/obo/MONDO_'],
},
'comp-loinc': {
'url': 'https://github.com/loinc/comp-loinc/releases/latest/download/merged_reasoned_loinc.owl',
'download_url': 'https://github.com/loinc/comp-loinc/releases/latest/download/merged_reasoned_loinc.owl',
'code_system_url': 'https://github.com/loinc/comp-loinc/releases/latest/download/merged_reasoned_loinc.owl',
'input_path': os.path.join(OWL_ON_FHIR_CONTENT_REPO_PATH, 'input', 'comploinc.owl'),
'id': 'comp-loinc',
'code_system_id': 'comp-loinc',
'native_uri_stems': ['https://loinc.org/'],
},
'HPO': {
'url': 'https://github.com/obophenotype/human-phenotype-ontology/releases/latest/download/hp-full.owl',
'download_url': 'https://github.com/obophenotype/human-phenotype-ontology/releases/latest/download/hp-full.owl',
'code_system_url': 'http://purl.obolibrary.org/obo/hp.owl',
'input_path': os.path.join(OWL_ON_FHIR_CONTENT_REPO_PATH, 'input', 'hpo.owl'),
'id': 'HPO',
'code_system_id': 'HPO',
'native_uri_stems': ['http://purl.obolibrary.org/obo/HP_'],
},
'rxnorm': {
'url': 'https://data.bioontology.org/'
'ontologies/RXNORM/submissions/23/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb',
'download_url': 'https://data.bioontology.org/'
'ontologies/RXNORM/submissions/23/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb',
'code_system_url': 'http://purl.bioontology.org/ontology/RXNORM',
'input_path': os.path.join(OWL_ON_FHIR_CONTENT_REPO_PATH, 'input', 'RXNORM.ttl'),
'id': 'rxnorm',
'code_system_id': 'rxnorm',
'native_uri_stems': ['http://purl.bioontology.org/ontology/RXNORM/'],
},
'sequence-ontology': {
'url': 'https://data.bioontology.org/'
'ontologies/SO/submissions/304/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb',
'download_url': 'https://data.bioontology.org/'
'ontologies/SO/submissions/304/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb',
'code_system_url': 'http://purl.bioontology.org/ontology/SO',
'input_path': os.path.join(OWL_ON_FHIR_CONTENT_REPO_PATH, 'input', 'so.owl'),
'id': 'sequence-ontology',
'code_system_id': 'sequence-ontology',
'native_uri_stems': ['http://purl.obolibrary.org/obo/SO_'],
},
}
})


# Functions
Expand Down Expand Up @@ -143,7 +155,7 @@ def owl_to_semsql(inpath: str, use_cache=False) -> str:
return outpath


def owl_to_obograph(inpath: str, use_cache=False) -> str:
def owl_to_obograph(inpath: str, native_uri_stems: List[str] = None, use_cache=False) -> str:
"""Convert OWL to Obograph
# todo: TTL and RDF also supported? not just OWL?"""
# Vars
Expand All @@ -162,67 +174,27 @@ def owl_to_obograph(inpath: str, use_cache=False) -> str:
# graph = parse_results.graph_document.graphs[0]
_run_shell_command(command)

# todo: might want to add this patch back and open up an issue, because nico said in issue below shouldn't happen
# - issue would be in OAK, regarding the 'cooked' error
# Patch missing roots / etc issue (until resolved: https://github.com/ontodev/robot/issues/1082)
# ! - Deactivated this because I was getting an error about the very same IDs that Chris R was asking for
# Try uploading and see if it works.
#
# - This appears to be mostly a problem in FHIR (and maybe just Obographs) if subClassOf or variation missing, but
# not 100% sure
#
# missing_nodes_from_important_edge_preds = [
# 'is_a',
# 'http://purl.bioontology.org/ontology/RXNORM/isa',
# 'rdfs:subClassOf',
# 'http://www.w3.org/2000/01/rdf-schema#subClassOf'
# ]
# with open(outpath, 'r') as f:
# data = json.load(f)
# nodes = data['graphs'][0]['nodes']
# node_ids = set([node['id'] for node in nodes])
# edges = data['graphs'][0]['edges']
# edges = [x for x in edges if x['pred'] in missing_nodes_from_important_edge_preds]
# edge_subs = set([edge['sub'] for edge in edges])
# edge_objs = set([edge['obj'] for edge in edges])
# edge_ids = edge_subs.union(edge_objs)
# missing = set([x for x in edge_ids if x not in node_ids])

# Edge case exclusions
# - This was causing the following error in OAK (I have not made a GH issue):
# - This example was from Mondo
# cooked_entry = Node(id="JsonObj(id='http://www.geneontology.org/formats/oboInOwl#Subset')", ...
# if cooked_entry[key_name] != key:
# > raise ...
# E ValueError: Slot: nodes - attribute id value (JsonObj(
# id='http://www.geneontology.org/formats/oboInOwl#Subset'))
# does not match key (http://www.geneontology.org/formats/oboInOwl#Subset)
#
# Method A: Remove cases
# id_exclusions = [
# 'http://www.geneontology.org/formats/oboInOwl#Subset'
# ]
# uri_stem_exclusions = [
# 'http://purl.obolibrary.org/obo/CARO_'
# ]
# for case in id_exclusions:
# if case in missing:
# missing.remove(case)
# missing2 = []
# for node_id in missing:
# if not any([node_id.startswith(x) for x in uri_stem_exclusions]):
# missing2.append(node_id)
#
# Method B: Keep only dominant IDs
# - Opted not to do

# if missing2:
# print(f'INFO: The following nodes were found in Obographs edges, but not nodes. Adding missing declarations: '
# f'{missing}')
# for node_id in missing:
# nodes.append({'id': node_id})
# with open(outpath, 'w') as f:
# json.dump(data, f)
if native_uri_stems:
with open(outpath, 'r') as f:
data = json.load(f)
nodes = data['graphs'][0]['nodes']
node_ids = set([node['id'] for node in nodes])
edges = data['graphs'][0]['edges']
# edges = [x for x in edges if x['pred'] in missing_nodes_from_important_edge_preds]
edge_subs = set([edge['sub'] for edge in edges])
edge_objs = set([edge['obj'] for edge in edges])
edge_ids = edge_subs.union(edge_objs)
missing = set([x for x in edge_ids if x not in node_ids]) # all missing
missing = [x for x in missing if any([x.startswith(y) for y in native_uri_stems])] # filter

if missing:
print(f'INFO: The following nodes were found in Obographs edges, but not nodes. Adding missing '
f'declarations: {missing}')
for node_id in missing:
nodes.append({'id': node_id})
with open(outpath, 'w') as f:
json.dump(data, f)

return outpath

Expand All @@ -231,18 +203,43 @@ def owl_to_obograph(inpath: str, use_cache=False) -> str:
# - https://github.com/linkml/linkml/issues/1156
# - https://github.com/ontodev/robot/issues/1079
# - https://github.com/geneontology/obographs/issues/89
def obograph_to_fhir(inpath: str, out_dir: str, out_filename: str = None, include_all_predicates=False) -> str:
def obograph_to_fhir(
inpath: str, out_dir: str, out_filename: str = None, code_system_id: str = None, code_system_url: str = None,
include_all_predicates=False, native_uri_stems: List[str] = None, dev_oak_path: str = None,
dev_oak_interpreter_path: str = None
) -> str:
"""Convert Obograph to FHIR"""
converter = OboGraphToFHIRConverter()
converter.curie_converter = curies.Converter.from_prefix_map(get_default_prefix_map())
gd: GraphDocument = json_loader.load(inpath, target_class=GraphDocument)
out_path = os.path.join(out_dir, out_filename)
if not os.path.exists(out_dir):
os.makedirs(out_dir)
converter.dump(gd, out_path, include_all_predicates=include_all_predicates)
local_dev_exists: bool = (os.path.exists(dev_oak_path) if dev_oak_path else False) and (
os.path.exists(dev_oak_interpreter_path) if dev_oak_interpreter_path else False)
native_uri_stems_str = '"' + ','.join(native_uri_stems) + '"' if native_uri_stems else None
if dev_oak_path and local_dev_exists: # Params last updated: 2023/01/15
dev_oak_cli_path = os.path.join(dev_oak_path, 'src', 'oaklib', 'cli.py')
command_str = \
f'{dev_oak_interpreter_path} {dev_oak_cli_path} -i {inpath} dump -o {out_path} -O fhirjson' + \
' --include-all-predicates' if include_all_predicates else '' + \
f' --code-system-id {code_system_id}' if code_system_id else '' + \
f' --code-system-url {code_system_url}' if code_system_url else '' + \
f' --native-uri-stems {native_uri_stems_str}' if native_uri_stems_str else ''
_run_shell_command(command_str)

elif dev_oak_path and not local_dev_exists:
print('Warning: Tried to use local dev OAK, but one of paths does not exist. Using installed OAK release.')
else:
converter = OboGraphToFHIRConverter()
converter.curie_converter = curies.Converter.from_prefix_map(get_default_prefix_map())
gd: GraphDocument = json_loader.load(inpath, target_class=GraphDocument)
converter.dump(gd, out_path, include_all_predicates=include_all_predicates)
# todo: update w/ these params when released
# converter.dump(
# gd, out_path, code_system_id='', code_system_url='', include_all_predicates=include_all_predicates,
# native_uri_stems=native_uri_stems, use_curies_native_concepts=False, use_curies_foreign_concepts=True)
return out_path


# todo: add local dev oak params to this and abstract a general 'run oak' func for this and obographs_to_fhir
def semsql_to_fhir(inpath: str, out_dir: str, out_filename: str = None, include_all_predicates=False) -> str:
"""Convert SemanticSQL sqlite DB to FHIR"""
# todo: any way to do this using Python API?
Expand All @@ -260,43 +257,46 @@ def semsql_to_fhir(inpath: str, out_dir: str, out_filename: str = None, include_
def owl_to_fhir(
input_path_or_url: str, out_dir: str = OUTDIR, out_filename: str = None, include_all_predicates=False,
retain_intermediaries=False, intermediary_type=['obographs', 'semsql'][0], use_cached_intermediaries=False,
intermediary_outdir: str = None, convert_intermediaries_only=False
intermediary_outdir: str = None, convert_intermediaries_only=False, native_uri_stems: List[str] = None,
code_system_id: str = None, code_system_url: str = None, dev_oak_path: str = None,
dev_oak_interpreter_path: str = None
) -> str:
"""Run conversion"""
# Download if necessary & determine outpaths
# todo: this section w/ urls, names, and IDs has too many possible branches and is error prone. simplify by
# updating CLI params to require url or path separately, and maybe require codesystem id
intermediary_outdir = intermediary_outdir if intermediary_outdir else out_dir
input_path = input_path_or_url
url = None
maybe_url = urlparse(input_path_or_url)
if maybe_url.scheme and maybe_url.netloc:
url = input_path_or_url
if not out_filename and not url:
_id = '.'.join(os.path.basename(input_path).split('.')[0:-1]) # removes file extension
out_filename = f'CodeSystem-{_id}.json'
elif not out_filename and url:
out_filename = 'CodeSystem.json'
if url:
input_path = os.path.join(CACHE_DIR, out_filename.replace('.json', '.owl'))
download(url, input_path)
if not out_filename:
if not code_system_id:
code_system_id = '.'.join(os.path.basename(input_path).split('.')[0:-1]) # removes file extension
out_filename = f'CodeSystem-{code_system_id}.json'
if not code_system_id and out_filename and out_filename.startswith('CodeSystem-'):
code_system_id = out_filename.split('-')[1].split('.')[0]

# Preprocessing: Special cases
if 'rxnorm' in input_path.lower() or 'rxnorm' in out_filename.lower():
input_path = _preprocess_rxnorm(input_path)

# Convert
if intermediary_type == 'obographs' or input_path.endswith('.ttl'): # semsql only supports .owl
intermediary_path = owl_to_obograph(input_path, use_cached_intermediaries)
intermediary_path = owl_to_obograph(input_path, native_uri_stems, use_cached_intermediaries)
obograph_to_fhir(
inpath=intermediary_path,
out_dir=intermediary_outdir,
out_filename=out_filename,
include_all_predicates=include_all_predicates)
inpath=intermediary_path, out_dir=intermediary_outdir, out_filename=out_filename,
code_system_id=code_system_id, code_system_url=code_system_url, native_uri_stems=native_uri_stems,
include_all_predicates=include_all_predicates, dev_oak_path=dev_oak_path,
dev_oak_interpreter_path=dev_oak_interpreter_path)
else: # semsql
intermediary_path = owl_to_semsql(input_path, use_cached_intermediaries)
semsql_to_fhir(
inpath=intermediary_path,
out_dir=intermediary_outdir,
out_filename=out_filename,
inpath=intermediary_path, out_dir=intermediary_outdir, out_filename=out_filename,
include_all_predicates=include_all_predicates)
if convert_intermediaries_only:
return intermediary_path
Expand All @@ -319,7 +319,8 @@ def owl_to_fhir(
def _run_favorites(
use_cached_intermediaries: bool = None, retain_intermediaries: bool = None, include_all_predicates: bool = None,
intermediary_type: str = None, out_dir: str = None, intermediary_outdir: str = None,
convert_intermediaries_only: bool = None, favorites: Dict = FAVORITE_ONTOLOGIES
convert_intermediaries_only: bool = None, dev_oak_path: str = None, dev_oak_interpreter_path: str = None,
favorites: Dict = FAVORITE_ONTOLOGIES
):
"""Convert favorite ontologies"""
kwargs = {k: v for k, v in locals().items() if v is not None and not k.startswith('__') and k != 'favorites'}
Expand All @@ -328,15 +329,16 @@ def _run_favorites(
n = len(favorites)
i = 0
for d in favorites.values():
print('Converting {} of {}: {}'.format(i, n, d['id']))
i += 1
print('Converting {} of {}: {}'.format(i, n, d['code_system_id']))
try:
owl_to_fhir(
out_filename=f'CodeSystem-{d["id"]}.json',
input_path_or_url=d['input_path'] if d['input_path'] else d['url'], **kwargs)
out_filename=f'CodeSystem-{d["code_system_id"]}.json',
input_path_or_url=d['input_path'] if d['input_path'] else d['download_url'], **kwargs)
successes.append(d['id'])
except Exception as e:
fails.append(d['id'])
print('Failed to convert {}: \n{}'.format(d['id'], e))
fails.append(d['code_system_id'])
print('Failed to convert {}: \n{}'.format(d['code_system_id'], e))
print('SUMMARY')
print('Successes: ' + str(successes))
print('Failures: ' + str(fails))
Expand All @@ -351,9 +353,26 @@ def cli():
'-o', '--out-dir', required=False, default=OUTDIR, help='The directory where results should be saved.')
parser.add_argument(
'-n', '--out-filename', required=False, help='Filename for the primary file converted, e.g. CodeSystem.')
parser.add_argument(
'-s', '--code-system-id', required=False, default=False,
help="For `fhirjson` only. The code system ID to use for identification on the server uploaded to. "
"See: https://hl7.org/fhir/resource-definitions.html#Resource.id",)
parser.add_argument(
'-S', '--code-system-url', required=False, default=False,
help="For `fhirjson` only. Canonical URL for the code system. "
"See: https://hl7.org/fhir/codesystem-definitions.html#CodeSystem.url",)
parser.add_argument(
'-p', '--include-all-predicates', action='store_true', required=False, default=False,
help='Include all predicates in CodeSystem.property and CodeSystem.concept.property, or just is_a/parent?')
parser.add_argument(
'-u', '--native-uri-stems', required=False, nargs='+',
help='A comma-separated list of URI stems that will be used to determine whether a concept is native to '
'the CodeSystem. For example, for OMIM, the following URI stems are native: '
'https://omim.org/entry/,https://omim.org/phenotypicSeries/PS"'
'As of 2023-01-15, there is still a bug in the Obographs spec and/or `robot` where certain nodes are not'
' being converted. This converter adds back the nodes, but to know which ones belong to the CodeSystem '
'itself and are not foreign concepts, this parameter is necessary. OAK also makes use of this parameter. '
'See also: https://github.com/geneontology/obographs/issues/90')
parser.add_argument(
'-t', '--intermediary-type', choices=INTERMEDIARY_TYPES, default='obographs', required=False,
help='Which type of intermediary to use? First, we convert OWL to that intermediary format, and then we '
Expand All @@ -367,14 +386,24 @@ def cli():
parser.add_argument(
'-I', '--convert-intermediaries-only', action='store_true', default=False, required=False,
help='Convert intermediaries only?')
parser.add_argument(
'-d', '--dev-oak-path', default=False, required=False,
help='If you want to use a local development version of OAK, specify the path to the OAK directory here. '
'Must be used with --dev-oak-interpreter-path.')
parser.add_argument(
'-D', '--dev-oak-interpreter-path', default=False, required=False,
help='If you want to use a local development version of OAK, specify the path to the Python interpreter where '
'its dependencies are installed (i.e. its virtual environment). Must be used with --dev-oak-path.')
parser.add_argument(
'-f', '--favorites', action='store_true', default=False, required=False,
help='If present, will run all favorite ontologies found in `FAVORITE_ONTOLOGIES`. If using this option, the '
'other CLI flags are not relevant. Instead, edit the following config: `FAVORITE_DEFAULTS`.')

d: Dict = vars(parser.parse_args())
if d['favorites']:
_run_favorites(**{**FAVORITE_DEFAULTS, **{'favorites': FAVORITE_ONTOLOGIES}})
_run_favorites(
dev_oak_path=d['dev_oak_path'], dev_oak_interpreter_path=d['dev_oak_interpreter_path'],
**{**FAVORITE_DEFAULTS, **{'favorites': FAVORITE_ONTOLOGIES}})
del d['favorites']
owl_to_fhir(**d)

Expand Down

0 comments on commit 2f46a1f

Please sign in to comment.