Commit d782373

Merge pull request #36 from IHCC-cohorts/master
Various changes, mostly OXO mapping pipeline
jamesamcl authored Aug 13, 2020
2 parents 86b805c + fd76c1c commit d782373
Showing 4 changed files with 34 additions and 30 deletions.
4 changes: 2 additions & 2 deletions oxo-loader/Dockerfile
@@ -1,9 +1,9 @@
FROM python:3-alpine

ADD oxo-loader /opt/oxo-loader

RUN apk add --no-cache bash mariadb-dev build-base
#RUN apk --update add mysql mysql-client

ADD oxo-loader /opt/oxo-loader
RUN cd /opt/oxo-loader && pip install -r requirements.txt
RUN chmod +x /opt/oxo-loader/load_all.sh

3 changes: 1 addition & 2 deletions oxo-loader/OlsDatasetExtractor.py
@@ -60,6 +60,7 @@
for ontology in ontologies:
namespace = ontology["config"]["namespace"]
version = ontology["updated"]
prefPrefix = ontology["config"]["preferredPrefix"]

altPrefixes = [namespace]
if namespace == 'ordo':
@@ -72,8 +73,6 @@
elif namespace == "ncit":
prefPrefix = "NCIT"
altPrefixes = [namespace, "NCI_Thesaurus", "NCI", "ncithesaurus", "NCI2009_04D"]
else:
prefPrefix = ontology["config"]["preferredPrefix"]

title = ontology["config"]["title"]
desc = ontology["config"]["description"]
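Net effect of the two hunks above: preferredPrefix is now read from the ontology's OLS config up front, and the special-cased namespaces simply override it, which makes the old else branch redundant. A minimal sketch of the resulting flow (helper name invented; the ordo branch is collapsed in the diff, so only the visible ncit case is shown):

def resolve_prefixes(ontology):
    # default comes straight from the ontology's OLS config
    prefPrefix = ontology["config"]["preferredPrefix"]
    namespace = ontology["config"]["namespace"]
    altPrefixes = [namespace]
    if namespace == "ncit":
        # special cases override the config default
        prefPrefix = "NCIT"
        altPrefixes = [namespace, "NCI_Thesaurus", "NCI", "ncithesaurus", "NCI2009_04D"]
    return prefPrefix, altPrefixes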
49 changes: 28 additions & 21 deletions oxo-loader/OlsMappingExtractor.py
100755 → 100644
@@ -31,6 +31,7 @@
OXO.olsurl=config.get("Basics","olsurl")

solrBaseUrl=config.get("Basics","olsSolrBaseUrl")
skipEfo=config.get("Basics","skipEfo")

exportFileTerms= config.get("Paths","exportFileTerms")
if options.terms:
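The new skipEfo setting is read with the same configparser call as the neighbouring options. One caveat worth illustrating: config.get() always returns a string, so a value like "False" would still be truthy. A sketch of reading such a flag as a real boolean (config file name is hypothetical):

import configparser

config = configparser.ConfigParser()
config.read("oxo.cfg")  # hypothetical file name

# getboolean() maps yes/no, on/off, true/false, 1/0 to a bool;
# plain get() would hand back the raw string
skip_efo = config.getboolean("Basics", "skipEfo", fallback=False)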
@@ -84,9 +85,10 @@
prefixToPreferred[prefix] = prefix
for altPrefix in data["alternatePrefix"]:
prefixToPreferred[altPrefix] = prefix
if "idorgNamespace" in data and data["idorgNamespace"] != '':
if "idorgNamespace" in data and data["idorgNamespace"] != '':
idorgNamespace[altPrefix.lower()] = data["idorgNamespace"]
idorgNamespace[prefix.lower()] = data["idorgNamespace"]

print("Reading datasources from OxO done")

# these are the annotation properties where we look for xrefs
@@ -98,14 +100,15 @@
# find all the EFO xref annotation properties
# note EFO does xrefs in a different way to all the other OBO ontologies so
# give it special consideration
response = urllib.request.urlopen(getEfoAnnotationsUrl)
print(getEfoAnnotationsUrl)
print(response)
cr = csv.reader(response.read().decode('utf-8'))
for row in cr:
for p in row:
if 'definition_citation' in p:
knownAnnotations.append(p)
if not skipEfo:
response = urllib.request.urlopen(getEfoAnnotationsUrl)
print(getEfoAnnotationsUrl)
print(response)
cr = csv.reader(response.read().decode('utf-8'))
for row in cr:
for p in row:
if 'definition_citation' in p:
knownAnnotations.append(p)

unknownSource = {}
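The EFO annotation-property discovery above now runs only when skipEfo is unset. A related detail worth a sketch: csv.reader expects an iterable of lines, so a decoded HTTP body is conventionally wrapped in a StringIO first (placeholder URL; the real query URL is assembled earlier in the script):

import csv
import io
import urllib.request

url = "https://example.org/annotations.csv"  # placeholder

with urllib.request.urlopen(url) as response:
    body = response.read().decode("utf-8")

# io.StringIO lets csv.reader walk the body line by line
for row in csv.reader(io.StringIO(body)):
    for prop in row:
        if "definition_citation" in prop:
            print(prop)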

@@ -116,16 +119,18 @@
# main function that crawls the OLS Solr documents for xrefs
# We use the Solr endpoint directly instead of the OLS API as we can restrict the query
# to only terms that have an xref. In the future we may add this functionality to the OLS API


def processSolrDocs(url):
rows = solrChunks
initUrl = url + "&start=0&rows=" + str(rows)
reply = urllib.request.urlopen(initUrl)
anwser = json.load(reply)
with urllib.request.urlopen(initUrl) as reply:
json_terms = json.loads(reply.read().decode())

size = anwser["response"]["numFound"]
size = json_terms["response"]["numFound"]

for x in range(rows, size, rows):
for docs in anwser["response"]["docs"]:
for x in range(0, size, rows):
for docs in json_terms["response"]["docs"]:
fromPrefix = None
fromId = None

@@ -200,9 +205,10 @@ def processSolrDocs(url):
"id": toId,
"curie": toCurie,
"uri": None,
"label":None
"label": None
}


if fromCurie == toCurie:
continue
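Each xref ends up as a small record keyed by CURIE, and mappings from a term to itself are dropped. A compact sketch of that bookkeeping (helper name and values invented):

def make_record(prefix, local_id):
    curie = prefix + ":" + local_id
    return {"id": local_id, "curie": curie, "uri": None, "label": None}

from_term = make_record("A", "0001")  # toy values
to_term = make_record("A", "0001")

if from_term["curie"] == to_term["curie"]:
    pass  # self-mapping: skipped, as with the continue above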

@@ -243,14 +249,16 @@ def processSolrDocs(url):

print(str(x))
initUrl = url + "&start=" + str(x) + "&rows=" + str(rows)
reply = urllib.request.urlopen(initUrl)
anwser = json.load(reply)
with urllib.request.urlopen(initUrl) as reply:
json_terms = json.loads(reply.read().decode())


# do the query to get docs from solr and process

processSolrDocs(efoSolrQueryUrl)
print("Done processing EFO, starting to query OLS")
if not skipEfo:
processSolrDocs(efoSolrQueryUrl)
print("Done processing EFO, starting to query OLS")

processSolrDocs(olsDbxrefSolrQuery)
print("Done processing OLS")

@@ -274,14 +282,13 @@ def processSolrDocs(url):
print("Finished, here are all the unknown sources")
for key, value in unknownSource.items() :
# see if we can match prefix to db
print(key.encode('utf-8', 'ignore'))
print(key)

print("Generating CSV files for neo loading...")


import OxoCsvBuilder
builder = OxoCsvBuilder.Builder()

builder.exportTermsToCsv(exportFileTerms, terms)
builder.exportMappingsToCsv(exportFileMappings, postMappings, prefixToDatasource)

8 changes: 3 additions & 5 deletions oxo-loader/OxoCsvBuilder.py
@@ -47,11 +47,9 @@ def exportTermsToCsv(self, file, terms):
label = None
uri = None

try:
if term["label"] is not None:
label = term["label"].encode('utf-8', errors="ignore")
except:
pass
if term["label"] is not None:
#https://docs.python.org/release/3.0.1/whatsnew/3.0.html#text-vs-data-instead-of-unicode-vs-8-bit
label = term["label"]

if term["uri"] is not None:
uri = term["uri"]
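The try/except around .encode() disappears because Python 3 strings are already text; per the linked "text vs. data" note, encoding belongs at the file boundary rather than on individual fields. A small sketch of writing such rows (file name and data invented):

import csv

terms = [{"label": "café-au-lait macules", "uri": "http://example.org/T0001"}]  # toy data

# newline='' is the csv-module convention; encoding is set once, at open time
with open("terms.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    for term in terms:
        writer.writerow([term["label"], term["uri"]])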
