Commit d782373

Merge pull request #36 from IHCC-cohorts/master
Various changes, mostly OXO mapping pipeline
jamesamcl authored Aug 13, 2020
2 parents 86b805c + fd76c1c commit d782373
Showing 4 changed files with 34 additions and 30 deletions.
4 changes: 2 additions & 2 deletions oxo-loader/Dockerfile
@@ -1,9 +1,9 @@
FROM python:3-alpine

ADD oxo-loader /opt/oxo-loader

RUN apk add --no-cache bash mariadb-dev build-base
#RUN apk --update add mysql mysql-client

ADD oxo-loader /opt/oxo-loader
RUN cd /opt/oxo-loader && pip install -r requirements.txt
RUN chmod +x /opt/oxo-loader/load_all.sh

3 changes: 1 addition & 2 deletions oxo-loader/OlsDatasetExtractor.py
@@ -60,6 +60,7 @@
for ontology in ontologies:
namespace = ontology["config"]["namespace"]
version = ontology["updated"]
prefPrefix = ontology["config"]["preferredPrefix"]

altPrefixes = [namespace]
if namespace == 'ordo':
@@ -72,8 +73,6 @@
elif namespace == "ncit":
prefPrefix = "NCIT"
altPrefixes = [namespace, "NCI_Thesaurus", "NCI", "ncithesaurus", "NCI2009_04D"]
else:
prefPrefix = ontology["config"]["preferredPrefix"]

title = ontology["config"]["title"]
desc = ontology["config"]["description"]
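Net effect of the two hunks above: preferredPrefix is now read from the ontology's OLS config up front, and the special-cased namespaces simply override it, which makes the old else branch redundant. A minimal sketch of the resulting flow (helper name invented; the ordo branch is collapsed in the diff, so only the visible ncit case is shown):

def resolve_prefixes(ontology):
    # default comes straight from the ontology's OLS config
    prefPrefix = ontology["config"]["preferredPrefix"]
    namespace = ontology["config"]["namespace"]
    altPrefixes = [namespace]
    if namespace == "ncit":
        # special cases override the config default
        prefPrefix = "NCIT"
        altPrefixes = [namespace, "NCI_Thesaurus", "NCI", "ncithesaurus", "NCI2009_04D"]
    return prefPrefix, altPrefixes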
49 changes: 28 additions & 21 deletions oxo-loader/OlsMappingExtractor.py
100755 → 100644
@@ -31,6 +31,7 @@
OXO.olsurl=config.get("Basics","olsurl")

solrBaseUrl=config.get("Basics","olsSolrBaseUrl")
skipEfo=config.get("Basics","skipEfo")

exportFileTerms= config.get("Paths","exportFileTerms")
if options.terms:
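The new skipEfo setting is read with the same configparser call as the neighbouring options. One caveat worth illustrating: config.get() always returns a string, so a value like "False" would still be truthy. A sketch of reading such a flag as a real boolean (config file name is hypothetical):

import configparser

config = configparser.ConfigParser()
config.read("oxo.cfg")  # hypothetical file name

# getboolean() maps yes/no, on/off, true/false, 1/0 to a bool;
# plain get() would hand back the raw string
skip_efo = config.getboolean("Basics", "skipEfo", fallback=False)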
@@ -84,9 +85,10 @@
prefixToPreferred[prefix] = prefix
for altPrefix in data["alternatePrefix"]:
prefixToPreferred[altPrefix] = prefix
if "idorgNamespace" in data and data["idorgNamespace"] != '':
if "idorgNamespace" in data and data["idorgNamespace"] != '':
idorgNamespace[altPrefix.lower()] = data["idorgNamespace"]
idorgNamespace[prefix.lower()] = data["idorgNamespace"]

print("Reading datasources from OxO done")

# these are the annotation properties where we look for xrefs
@@ -98,14 +100,15 @@
# find all the EFO xref annotation properties
# note EFO does xrefs in a different way to all the other OBO ontologies so
# give it special consideration
response = urllib.request.urlopen(getEfoAnnotationsUrl)
print(getEfoAnnotationsUrl)
print(response)
cr = csv.reader(response.read().decode('utf-8'))
for row in cr:
for p in row:
if 'definition_citation' in p:
knownAnnotations.append(p)
if not skipEfo:
response = urllib.request.urlopen(getEfoAnnotationsUrl)
print(getEfoAnnotationsUrl)
print(response)
cr = csv.reader(response.read().decode('utf-8'))
for row in cr:
for p in row:
if 'definition_citation' in p:
knownAnnotations.append(p)

unknownSource = {}
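The EFO annotation-property discovery above now runs only when skipEfo is unset. A related detail worth a sketch: csv.reader expects an iterable of lines, so a decoded HTTP body is conventionally wrapped in a StringIO first (placeholder URL; the real query URL is assembled earlier in the script):

import csv
import io
import urllib.request

url = "https://example.org/annotations.csv"  # placeholder

with urllib.request.urlopen(url) as response:
    body = response.read().decode("utf-8")

# io.StringIO lets csv.reader walk the body line by line
for row in csv.reader(io.StringIO(body)):
    for prop in row:
        if "definition_citation" in prop:
            print(prop)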

@@ -116,16 +119,18 @@
# main function that crawls the OLS Solr documents for xrefs
# We use the Solr endpoint directly instead of the OLS API as we can restrict the query
# to only terms that have an xref. In the future we may add this functionality to the OLS API


def processSolrDocs(url):
rows = solrChunks
initUrl = url + "&start=0&rows=" + str(rows)
reply = urllib.request.urlopen(initUrl)
anwser = json.load(reply)
with urllib.request.urlopen(initUrl) as reply:
json_terms = json.loads(reply.read().decode())

size = anwser["response"]["numFound"]
size = json_terms["response"]["numFound"]

for x in range(rows, size, rows):
for docs in anwser["response"]["docs"]:
for x in range(0, size, rows):
for docs in json_terms["response"]["docs"]:
fromPrefix = None
fromId = None

@@ -200,9 +205,10 @@ def processSolrDocs(url):
"id": toId,
"curie": toCurie,
"uri": None,
"label":None
"label": None
}


if fromCurie == toCurie:
continue
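Each xref ends up as a small record keyed by CURIE, and mappings from a term to itself are dropped. A compact sketch of that bookkeeping (helper name and values invented):

def make_record(prefix, local_id):
    curie = prefix + ":" + local_id
    return {"id": local_id, "curie": curie, "uri": None, "label": None}

from_term = make_record("A", "0001")  # toy values
to_term = make_record("A", "0001")

if from_term["curie"] == to_term["curie"]:
    pass  # self-mapping: skipped, as with the continue above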

@@ -243,14 +249,16 @@ def processSolrDocs(url):

print(str(x))
initUrl = url + "&start=" + str(x) + "&rows=" + str(rows)
reply = urllib.request.urlopen(initUrl)
anwser = json.load(reply)
with urllib.request.urlopen(initUrl) as reply:
json_terms = json.loads(reply.read().decode())


# do the query to get docs from solr and process

processSolrDocs(efoSolrQueryUrl)
print("Done processing EFO, starting to query OLS")
if not skipEfo:
processSolrDocs(efoSolrQueryUrl)
print("Done processing EFO, starting to query OLS")

processSolrDocs(olsDbxrefSolrQuery)
print("Done processing OLS")

@@ -274,14 +282,13 @@ def processSolrDocs(url):
print("Finished, here are all the unknown sources")
for key, value in unknownSource.items() :
# see if we can match prefix to db
print(key.encode('utf-8', 'ignore'))
print(key)

print("Generating CSV files for neo loading...")


import OxoCsvBuilder
builder = OxoCsvBuilder.Builder()

builder.exportTermsToCsv(exportFileTerms, terms)
builder.exportMappingsToCsv(exportFileMappings, postMappings, prefixToDatasource)

8 changes: 3 additions & 5 deletions oxo-loader/OxoCsvBuilder.py
@@ -47,11 +47,9 @@ def exportTermsToCsv(self, file, terms):
label = None
uri = None

try:
if term["label"] is not None:
label = term["label"].encode('utf-8', errors="ignore")
except:
pass
if term["label"] is not None:
#https://docs.python.org/release/3.0.1/whatsnew/3.0.html#text-vs-data-instead-of-unicode-vs-8-bit
label = term["label"]

if term["uri"] is not None:
uri = term["uri"]
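The try/except around .encode() disappears because Python 3 strings are already text; per the linked "text vs. data" note, encoding belongs at the file boundary rather than on individual fields. A small sketch of writing such rows (file name and data invented):

import csv

terms = [{"label": "café-au-lait macules", "uri": "http://example.org/T0001"}]  # toy data

# newline='' is the csv-module convention; encoding is set once, at open time
with open("terms.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    for term in terms:
        writer.writerow([term["label"], term["uri"]])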
