diff --git a/oxo-loader/Dockerfile b/oxo-loader/Dockerfile
index d34f20c..aa21b6e 100644
--- a/oxo-loader/Dockerfile
+++ b/oxo-loader/Dockerfile
@@ -1,9 +1,9 @@
 FROM python:3-alpine
 
-ADD oxo-loader /opt/oxo-loader
-
 RUN apk add --no-cache bash mariadb-dev build-base
 #RUN apk --update add mysql mysql-client
+
+ADD oxo-loader /opt/oxo-loader
 
 RUN cd /opt/oxo-loader && pip install -r requirements.txt
 RUN chmod +x /opt/oxo-loader/load_all.sh
diff --git a/oxo-loader/OlsDatasetExtractor.py b/oxo-loader/OlsDatasetExtractor.py
index 95d0dc4..447344c 100755
--- a/oxo-loader/OlsDatasetExtractor.py
+++ b/oxo-loader/OlsDatasetExtractor.py
@@ -60,6 +60,7 @@ for ontology in ontologies:
     namespace = ontology["config"]["namespace"]
     version = ontology["updated"]
 
+    prefPrefix = ontology["config"]["preferredPrefix"]
     altPrefixes = [namespace]
 
    if namespace == 'ordo':
@@ -72,8 +73,6 @@ elif namespace == "ncit":
         prefPrefix = "NCIT"
         altPrefixes = [namespace, "NCI_Thesaurus", "NCI", "ncithesaurus", "NCI2009_04D"]
-    else:
-        prefPrefix = ontology["config"]["preferredPrefix"]
 
     title = ontology["config"]["title"]
     desc = ontology["config"]["description"]
diff --git a/oxo-loader/OlsMappingExtractor.py b/oxo-loader/OlsMappingExtractor.py
old mode 100755
new mode 100644
index 8f55519..fb98850
--- a/oxo-loader/OlsMappingExtractor.py
+++ b/oxo-loader/OlsMappingExtractor.py
@@ -31,6 +31,7 @@
 OXO.olsurl=config.get("Basics","olsurl")
 solrBaseUrl=config.get("Basics","olsSolrBaseUrl")
+skipEfo=config.getboolean("Basics","skipEfo",fallback=False)
 
 exportFileTerms= config.get("Paths","exportFileTerms")
 
 if options.terms:
@@ -84,9 +85,10 @@
     prefixToPreferred[prefix] = prefix
     for altPrefix in data["alternatePrefix"]:
         prefixToPreferred[altPrefix] = prefix
-        if "idorgNamespace" in data and data["idorgNamespace"] != '': 
+        if "idorgNamespace" in data and data["idorgNamespace"] != '':
             idorgNamespace[altPrefix.lower()] = data["idorgNamespace"]
             idorgNamespace[prefix.lower()] = data["idorgNamespace"]
+
 print("Reading datasources from OxO done")
 # these are the annotation properties where we look for xrefs
@@ -98,14 +100,15 @@
 # find all the EFO xref annotation properties
 # note EFO does xrefs in a different way to all the other OBO ontologies so
 # give it special consideration
-response = urllib.request.urlopen(getEfoAnnotationsUrl)
-print(getEfoAnnotationsUrl)
-print(response)
-cr = csv.reader(response.read().decode('utf-8'))
-for row in cr:
-    for p in row:
-        if 'definition_citation' in p:
-            knownAnnotations.append(p)
+if not skipEfo:
+    response = urllib.request.urlopen(getEfoAnnotationsUrl)
+    print(getEfoAnnotationsUrl)
+    print(response)
+    cr = csv.reader(response.read().decode('utf-8'))
+    for row in cr:
+        for p in row:
+            if 'definition_citation' in p:
+                knownAnnotations.append(p)
 
 unknownSource = {}
@@ -116,16 +119,18 @@
 # main function that crawls the OLS Solr documents for xrefs
 # We use the Solr endpoint directly instead of the OLS API as we can restrict the query
 # to only terms that have an xref. In the future we may add this functionality to the OLS API
+
+
 def processSolrDocs(url):
     rows = solrChunks
     initUrl = url + "&start=0&rows=" + str(rows)
-    reply = urllib.request.urlopen(initUrl)
-    anwser = json.load(reply)
+    with urllib.request.urlopen(initUrl) as reply:
+        json_terms = json.loads(reply.read().decode())
 
-    size = anwser["response"]["numFound"]
+    size = json_terms["response"]["numFound"]
 
-    for x in range(rows, size, rows):
-        for docs in anwser["response"]["docs"]:
+    for x in range(0, size, rows):
+        for docs in json_terms["response"]["docs"]:
 
             fromPrefix = None
             fromId = None
@@ -200,9 +205,10 @@ def processSolrDocs(url):
                 "id": toId,
                 "curie": toCurie,
                 "uri": None,
-                "label":None
+                "label": None
             }
 
+
             if fromCurie == toCurie:
                 continue
@@ -243,14 +249,16 @@ def processSolrDocs(url):
         print(str(x))
         initUrl = url + "&start=" + str(x) + "&rows=" + str(rows)
-        reply = urllib.request.urlopen(initUrl)
-        anwser = json.load(reply)
+        with urllib.request.urlopen(initUrl) as reply:
+            json_terms = json.loads(reply.read().decode())
 
 
 # do the query to get docs from solr and process
-processSolrDocs(efoSolrQueryUrl)
-print("Done processing EFO, starting to query OLS")
+if not skipEfo:
+    processSolrDocs(efoSolrQueryUrl)
+    print("Done processing EFO, starting to query OLS")
+
 processSolrDocs(olsDbxrefSolrQuery)
 print("Done processing OLS")
@@ -274,14 +282,13 @@ def processSolrDocs(url):
 print("Finished, here are all the unknown sources")
 for key, value in unknownSource.items() :
     # see if we can match prefix to db
-    print(key.encode('utf-8', 'ignore'))
+    print(key)
 
 print("Generating CSV files for neo loading...")
 import OxoCsvBuilder
 builder = OxoCsvBuilder.Builder()
-
 builder.exportTermsToCsv(exportFileTerms, terms)
 builder.exportMappingsToCsv(exportFileMappings, postMappings, prefixToDatasource)
diff --git a/oxo-loader/OxoCsvBuilder.py b/oxo-loader/OxoCsvBuilder.py
index 09f9f07..ac5d992 100644
--- a/oxo-loader/OxoCsvBuilder.py
+++ b/oxo-loader/OxoCsvBuilder.py
@@ -47,11 +47,9 @@ def exportTermsToCsv(self, file, terms):
             label = None
             uri = None
 
-            try:
-                if term["label"] is not None:
-                    label = term["label"].encode('utf-8', errors="ignore")
-            except:
-                pass
+            if term["label"] is not None:
+                #https://docs.python.org/release/3.0.1/whatsnew/3.0.html#text-vs-data-instead-of-unicode-vs-8-bit
+                label = term["label"]
 
             if term["uri"] is not None:
                 uri = term["uri"]
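Note on the processSolrDocs changes above: switching the loop to range(0, size, rows) counts pages from the first offset, and the with-statement guarantees each HTTP response is closed. For reference, here is a minimal, self-contained sketch of the same Solr paging pattern, written so that each page is fetched and processed exactly once; it assumes a standard Solr JSON response shape (response.numFound, response.docs), and fetch_solr_pages and handle_doc are hypothetical names, not part of the OxO codebase:

import json
import urllib.request

def fetch_solr_pages(query_url, handle_doc, rows=1000):
    """Walk a Solr select query in pages of `rows` documents.

    Hypothetical helper illustrating the paging pattern the patch
    moves to; it is not part of the OxO codebase.
    """
    start = 0
    num_found = None
    while num_found is None or start < num_found:
        page_url = query_url + "&start=" + str(start) + "&rows=" + str(rows)
        # Context manager closes the HTTP response, as in the patch.
        with urllib.request.urlopen(page_url) as reply:
            page = json.loads(reply.read().decode())
        num_found = page["response"]["numFound"]
        for doc in page["response"]["docs"]:
            handle_doc(doc)  # stand-in for the per-document xref extraction
        start += rows

Fetching at the top of the loop body avoids re-processing the first chunk or dropping the final partial page, both of which are easy mistakes when the re-fetch sits at the bottom of the loop as in processSolrDocs.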