From cb75bdc36e9032902c9c6398d4774a332bf4b4fc Mon Sep 17 00:00:00 2001 From: Nico Matentzoglu Date: Thu, 13 Aug 2020 14:45:50 +0100 Subject: [PATCH 1/3] various changes (mainly mapping extract) - Dockerfile: changed the order of the `RUN apk` and `ADD oxo-loader` commands to make it faster to compile the image at DEV time. - OlsDatasetExtractor: purely cosmetic, non-functional change. - OxoCsvBuilder: fix utf8 encoding of strings in python after python2 to python3 migration: in python3, strings are utf-8 by default (docs included in comment) - OlsMappingExtractor: - make it possible to SKIP efo loading with a config option. `Warning:` test what happens when the config is not set at all! - Do the json.loads with more state-of-the-art syntax - Most important: `for x in range(0, size, rows):` Please think about this for a minute before you merge this; this is the most major change, but I am not 100% sure it's exactly right. - --- oxo-loader/Dockerfile | 4 ++-- oxo-loader/OlsDatasetExtractor.py | 3 +-- oxo-loader/OlsMappingExtractor.py | 9 +++++++++ oxo-loader/OxoCsvBuilder.py | 8 +++----- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/oxo-loader/Dockerfile b/oxo-loader/Dockerfile index a58953d..b9fef14 100644 --- a/oxo-loader/Dockerfile +++ b/oxo-loader/Dockerfile @@ -1,9 +1,9 @@ FROM python:3-alpine -ADD oxo-loader /opt/oxo-loader - RUN apk add --no-cache bash mariadb-dev build-base #RUN apk --update add mysql mysql-client + +ADD oxo-loader /opt/oxo-loader RUN cd /opt/oxo-loader && pip install -r requirements.txt CMD bash diff --git a/oxo-loader/OlsDatasetExtractor.py b/oxo-loader/OlsDatasetExtractor.py index 95d0dc4..447344c 100755 --- a/oxo-loader/OlsDatasetExtractor.py +++ b/oxo-loader/OlsDatasetExtractor.py @@ -60,6 +60,7 @@ for ontology in ontologies: namespace = ontology["config"]["namespace"] version = ontology["updated"] + prefPrefix = ontology["config"]["preferredPrefix"] altPrefixes = [namespace] if namespace == 'ordo': @@ -72,8 +73,6 
@@ elif namespace == "ncit": prefPrefix = "NCIT" altPrefixes = [namespace, "NCI_Thesaurus", "NCI", "ncithesaurus", "NCI2009_04D"] - else: - prefPrefix = ontology["config"]["preferredPrefix"] title = ontology["config"]["title"] desc = ontology["config"]["description"] diff --git a/oxo-loader/OlsMappingExtractor.py b/oxo-loader/OlsMappingExtractor.py index 8f55519..31834c3 100755 --- a/oxo-loader/OlsMappingExtractor.py +++ b/oxo-loader/OlsMappingExtractor.py @@ -139,6 +139,11 @@ def processSolrDocs(url): fromPrefix = OXO.getPrefixFromCui(fromOboId) fromId = OXO.getIdFromCui(fromOboId) + # Terrible hack for this case, why does it not work from the base_uri? + if fromIri.startswith("https://purl.ihccglobal.org/"): + iri = fromIri.replace("https://purl.ihccglobal.org/", "") + fromPrefix, fromId = iri.split("_", 1) + if not fromPrefix and not fromId: fromPrefix = OXO.getPrefixFromCui(fromShortForm) fromId = OXO.getIdFromCui(fromShortForm) @@ -178,6 +183,10 @@ def processSolrDocs(url): toPrefix = OXO.getPrefixFromCui(xref) toId = OXO.getIdFromCui(xref) + if xref.startswith("https://purl.ihccglobal.org/"): + iri = xref.replace("https://purl.ihccglobal.org/", "") + toPrefix, toId = iri.split("_", 1) + if not toPrefix or not toId: print("Can't get prefix or id for " + xref) continue diff --git a/oxo-loader/OxoCsvBuilder.py b/oxo-loader/OxoCsvBuilder.py index 09f9f07..ac5d992 100644 --- a/oxo-loader/OxoCsvBuilder.py +++ b/oxo-loader/OxoCsvBuilder.py @@ -47,11 +47,9 @@ def exportTermsToCsv(self, file, terms): label = None uri = None - try: - if term["label"] is not None: - label = term["label"].encode('utf-8', errors="ignore") - except: - pass + if term["label"] is not None: + #https://docs.python.org/release/3.0.1/whatsnew/3.0.html#text-vs-data-instead-of-unicode-vs-8-bit + label = term["label"] if term["uri"] is not None: uri = term["uri"] From 9e2dcc839ae82ea54fa295888e29020f33b49bbb Mon Sep 17 00:00:00 2001 From: Nico Matentzoglu Date: Thu, 13 Aug 2020 14:48:09 +0100 
Subject: [PATCH 2/3] Update OlsMappingExtractor.py --- oxo-loader/OlsMappingExtractor.py | 58 +++++++++++++++---------------- 1 file changed, 28 insertions(+), 30 deletions(-) mode change 100755 => 100644 oxo-loader/OlsMappingExtractor.py diff --git a/oxo-loader/OlsMappingExtractor.py b/oxo-loader/OlsMappingExtractor.py old mode 100755 new mode 100644 index 31834c3..bd200a1 --- a/oxo-loader/OlsMappingExtractor.py +++ b/oxo-loader/OlsMappingExtractor.py @@ -31,6 +31,7 @@ OXO.olsurl=config.get("Basics","olsurl") solrBaseUrl=config.get("Basics","olsSolrBaseUrl") +skipEfo=config.get("Basics","skipEfo") exportFileTerms= config.get("Paths","exportFileTerms") if options.terms: @@ -84,9 +85,10 @@ prefixToPreferred[prefix] = prefix for altPrefix in data["alternatePrefix"]: prefixToPreferred[altPrefix] = prefix - if "idorgNamespace" in data and data["idorgNamespace"] != '': + if "idorgNamespace" in data and data["idorgNamespace"] != '': idorgNamespace[altPrefix.lower()] = data["idorgNamespace"] idorgNamespace[prefix.lower()] = data["idorgNamespace"] + print("Reading datasources from OxO done") # these are the annotation properties where we look for xrefs @@ -98,14 +100,15 @@ # find all the EFO xref annotation propertied # note EFO does xrefs in a different way to all the other OBO ontologies so # give it special consideration -response = urllib.request.urlopen(getEfoAnnotationsUrl) -print(getEfoAnnotationsUrl) -print(response) -cr = csv.reader(response.read().decode('utf-8')) -for row in cr: - for p in row: - if 'definition_citation' in p: - knownAnnotations.append(p) +if not skipEfo: + response = urllib.request.urlopen(getEfoAnnotationsUrl) + print(getEfoAnnotationsUrl) + print(response) + cr = csv.reader(response.read().decode('utf-8')) + for row in cr: + for p in row: + if 'definition_citation' in p: + knownAnnotations.append(p) unknownSource = {} @@ -116,16 +119,18 @@ # main function that gets crawls the OLS Solr documents for xrefs # We use the Solr endpoint directly 
instead of the OLS API as we can restrict the query # to only terms that have an xref. In the future we may add this functionality to the OLS API + + def processSolrDocs(url): rows = solrChunks initUrl = url + "&start=0&rows=" + str(rows) - reply = urllib.request.urlopen(initUrl) - anwser = json.load(reply) + with urllib.request.urlopen(initUrl) as reply: + json_terms = json.loads(reply.read().decode()) - size = anwser["response"]["numFound"] + size = json_terms["response"]["numFound"] - for x in range(rows, size, rows): - for docs in anwser["response"]["docs"]: + for x in range(0, size, rows): + for docs in json_terms["response"]["docs"]: fromPrefix = None fromId = None @@ -139,11 +144,6 @@ def processSolrDocs(url): fromPrefix = OXO.getPrefixFromCui(fromOboId) fromId = OXO.getIdFromCui(fromOboId) - # Terrible hack for this case, why does it not work from the base_uri? - if fromIri.startswith("https://purl.ihccglobal.org/"): - iri = fromIri.replace("https://purl.ihccglobal.org/", "") - fromPrefix, fromId = iri.split("_", 1) - if not fromPrefix and not fromId: fromPrefix = OXO.getPrefixFromCui(fromShortForm) fromId = OXO.getIdFromCui(fromShortForm) @@ -182,11 +182,7 @@ def processSolrDocs(url): if ":" in xref or "_" in xref: toPrefix = OXO.getPrefixFromCui(xref) toId = OXO.getIdFromCui(xref) - - if xref.startswith("https://purl.ihccglobal.org/"): - iri = xref.replace("https://purl.ihccglobal.org/", "") - toPrefix, toId = iri.split("_", 1) - + if not toPrefix or not toId: print("Can't get prefix or id for " + xref) continue @@ -209,9 +205,10 @@ def processSolrDocs(url): "id": toId, "curie": toCurie, "uri": None, - "label":None + "label": None } + if fromCurie == toCurie: continue @@ -252,14 +249,16 @@ def processSolrDocs(url): print(str(x)) initUrl = url + "&start=" + str(x) + "&rows=" + str(rows) - reply = urllib.request.urlopen(initUrl) - anwser = json.load(reply) + with urllib.request.urlopen(initUrl) as reply: + json_terms = json.loads(reply.read().decode()) # do 
the query to get docs from solr and process -processSolrDocs(efoSolrQueryUrl) -print("Done processing EFO, starting to query OLS") +if not skipEfo: + processSolrDocs(efoSolrQueryUrl) + print("Done processing EFO, starting to query OLS") + processSolrDocs(olsDbxrefSolrQuery) print("Done processing OLS") @@ -290,7 +289,6 @@ def processSolrDocs(url): import OxoCsvBuilder builder = OxoCsvBuilder.Builder() - builder.exportTermsToCsv(exportFileTerms, terms) builder.exportMappingsToCsv(exportFileMappings, postMappings, prefixToDatasource) From fd76c1c7b0c39c660cd3fa8113a46ebc86b2c36b Mon Sep 17 00:00:00 2001 From: Nico Matentzoglu Date: Thu, 13 Aug 2020 15:00:21 +0100 Subject: [PATCH 3/3] Update OlsMappingExtractor.py --- oxo-loader/OlsMappingExtractor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/oxo-loader/OlsMappingExtractor.py b/oxo-loader/OlsMappingExtractor.py index bd200a1..fb98850 100644 --- a/oxo-loader/OlsMappingExtractor.py +++ b/oxo-loader/OlsMappingExtractor.py @@ -182,7 +182,7 @@ def processSolrDocs(url): if ":" in xref or "_" in xref: toPrefix = OXO.getPrefixFromCui(xref) toId = OXO.getIdFromCui(xref) - + if not toPrefix or not toId: print("Can't get prefix or id for " + xref) continue @@ -282,7 +282,7 @@ def processSolrDocs(url): print("Finished, here are all the unknown sources") for key, value in unknownSource.items() : # see if we can match prefix to db - print(key.encode('utf-8', 'ignore')) + print(key) print("Generating CSV files for neo loading...")