Merge pull request #1005 from TomConlin/master

ClinVar
monarch-initiative · Dec 31, 2020 · 7c4cbb8 · 7c4cbb8
2 parents a6bde60 + 855e551
commit 7c4cbb8
Show file tree

Hide file tree

Showing 5 changed files with 94 additions and 65 deletions.
diff --git a/dipper/sources/ClinVar.py b/dipper/sources/ClinVar.py
@@ -21,7 +21,7 @@
         get a list of RCV    default CV_test_RCV.txt
         put the input files the raw directory
         write the test set back to the raw directory
-    ./scripts/ClinVarXML_Subset.sh | gzip > raw/clinvarxml_alpha/ClinVarTestSet.xml.gz
+    ./scripts/ClinVarXML_Subset.sh | gzip > raw/clinvar/ClinVarTestSet.xml.gz
 
 
     parsing a test set  (Skolemizing blank nodes  i.e. for Protege)
@@ -30,7 +30,9 @@
     For while we are still required to redundantly conflate the owl properties
     in with the data files.
 
-    python3 ./scripts/add-properties2turtle.py --input ./out/ClinVarTestSet_`datestamp`.nt --output ./out/ClinVarTestSet_`datestamp`.nt --format nt
+    python3 ./scripts/add-properties2turtle.py \
+        --input ./out/ClinVarTestSet_`datestamp`.nt \
+        --output ./out/ClinVarTestSet_`datestamp`.nt --format nt
 
 """
 
@@ -41,7 +43,7 @@
 import hashlib
 import logging
 import argparse
-import xml.etree.ElementTree as ET
+import xml.etree.ElementTree as ElementTree
 from typing import List, Dict
 import yaml
 from dipper.models.ClinVarRecord import ClinVarRecord, Gene,\
@@ -83,9 +85,7 @@
 CURIERE = re.compile(r'^.*:[A-Za-z0-9_][A-Za-z0-9_.]*[A-Za-z0-9_]*$')
 
 
-def make_spo(sub, prd, obj,
-             subject_category=None,
-             object_category=None):
+def make_spo(sub, prd, obj, subject_category=None, object_category=None):
     """
     Decorates the three given strings as a line of ntriples
     (also writes a triple for subj biolink:category and
@@ -100,13 +100,13 @@ def make_spo(sub, prd, obj,
         prd = 'rdf:type'
 
     try:
-        (subcuri, subid) = re.split(r':', sub)
+        (subcuri, subid) = sub.split(r':')
     except Exception:
         LOG.error("not a Subject Curie  '%s'", sub)
         raise ValueError
 
     try:
-        (prdcuri, prdid) = re.split(r':', prd)
+        (prdcuri, prdid) = prd.split(r':')
     except Exception:
         LOG.error("not a Predicate Curie  '%s'", prd)
         raise ValueError
@@ -115,12 +115,14 @@ def make_spo(sub, prd, obj,
 
     # object is a curie or bnode or literal [string|number] NOT None.
     assert (obj is not None), '"None" object for subject ' + sub + ' & pred ' + prd
+    # object is NOT empty.
+    assert (obj != ''), 'EMPTY object for subject ' + sub + ' & pred ' + prd
 
     if sub is None:
         LOG.error("make_spo() was passed sub of None!")
         return ""
-    if obj is None:
-        LOG.error("make_spo() was passed obj of None")
+    if obj is None or obj == '':
+        LOG.error("make_spo() was passed obj of None/empty")
         return ""
 
     objcuri = None
@@ -142,8 +144,9 @@ def make_spo(sub, prd, obj,
     else:
         # Literals may not contain the characters ", LF, CR '\'
         # except in their escaped forms. internal quotes as well.
+        # for downstream sanity any control chars should be escaped
         obj = obj.strip('"').replace('\\', '\\\\').replace('"', '\'')
-        obj = obj.replace('\n', '\\n').replace('\r', '\\r')
+        obj = obj.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t')
         objt = '"' + obj + '"'
 
     # allow unexpanded bnodes in subject
@@ -154,7 +157,7 @@ def make_spo(sub, prd, obj,
             subjt = '<' + subjt + '>'
     else:
         raise ValueError(
-            "Cant work with: <{}> {} , <{}> {}, {}".format(
+            "Can not work with: <{}> {} , <{}> {}, {}".format(
                 subcuri, subid, prdcuri, prdid, objt))
 
     triples = subjt + ' <' + CURIEMAP[prdcuri] + prdid.strip() + '> ' + objt + " .\n"
@@ -222,9 +225,9 @@ def write_spo(sub, prd, obj, triples, subject_category=None, object_category=Non
     """
         write triples to a buffer in case we decide to drop them
     """
-    triples.append(make_spo(sub, prd, obj,
-                            subject_category=subject_category,
-                            object_category=object_category))
+    triples.append(make_spo(
+        sub, prd, obj,
+        subject_category=subject_category, object_category=object_category))
 
 
 def scv_link(scv_sig, rcv_trip):
@@ -294,14 +297,21 @@ def process_measure_set(measure_set, rcv_acc) -> Variant:
     :param rcv_acc: str rcv accession
     :return: Variant object
     """
-    rcv_variant_id = measure_set.get('ID')
+    rcv_variant_id = measure_set.get('ID')      # Short integer accession
+    # rcv_variant_acc = measure_set.get('Acc')  # Long namespaced-zeropadded identifier
     measure_set_type = measure_set.get('Type')
 
     # Create Variant object
     rcv_variant_id = 'ClinVarVariant:' + rcv_variant_id
     variant = Variant(id=rcv_variant_id)
 
-    if measure_set_type in ["Haplotype", "Phase unknown", "Distinct chromosomes", "Haplotype, single variant"]:
+    if measure_set_type in [
+            "Haplotype",
+            "Phase unknown",
+            "Distinct chromosomes",
+            "Haplotype, single variant",
+            # "Variant",  # see below
+    ]:
         variant.variant_type = measure_set_type
     elif measure_set_type == "Variant":
         # We will attempt to infer the type
@@ -317,8 +327,7 @@ def process_measure_set(measure_set, rcv_acc) -> Variant:
         if allele_name is not None:
             rcv_allele_label = allele_name.text
         # else:
-        #    LOG.warning(
-        #       rcv_acc + " VARIANT MISSING LABEL")
+        #    LOG.warning(rcv_acc + " VARIANT MISSING LABEL")
 
         allele_type = rcv_measure.get('Type').strip()
 
@@ -360,7 +369,7 @@ def process_measure_set(measure_set, rcv_acc) -> Variant:
         rcv_allele_rels = rcv_measure.findall('./MeasureRelationship')
 
         if rcv_allele_rels is None:  # try letting them all through
-            LOG.info(ET.tostring(rcv_measure).decode('utf-8'))
+            LOG.info(ElementTree.tostring(rcv_measure).decode('utf-8'))
         else:
             for measure in rcv_allele_rels:
                 allele_rel_type = measure.get('Type').strip()
@@ -403,6 +412,10 @@ def resolve(label):
     in order of preference
     return g(f(x))|f(x)|g(x) | x
     TODO consider returning x on fall through
+
+    # the decendent resolve(label) function in Source.py
+    # should be used instead and this f(x) removed
+
     : return label's mapping
 
     '''
@@ -429,8 +442,9 @@ def allele_to_triples(allele, triples) -> None:
     :return: None
     """
 
-    write_spo(allele.id, GLOBALTT['type'], resolve(allele.variant_type), triples,
-              subject_category=blv.terms['SequenceVariant'])
+    write_spo(
+        allele.id, GLOBALTT['type'], resolve(allele.variant_type), triples,
+        subject_category=blv.terms['SequenceVariant'])
     write_spo(allele.id, GLOBALTT['in taxon'], GLOBALTT['Homo sapiens'], triples)
     if allele.label is not None:
         write_spo(allele.id, GLOBALTT['label'], allele.label, triples)
@@ -674,7 +688,7 @@ def parse():
     # INPUT
     argparser.add_argument(
         '-f', '--filename', default=files['f1']['file'],
-        help="input filename. default: '" + files['f1']['file'] + "'")
+        help="gziped .xml input filename. default: '" + files['f1']['file'] + "'")
 
     argparser.add_argument(
         '-m', '--mapfile', default=files['f2']['file'],
@@ -685,14 +699,12 @@ def parse():
         help="path to input file. default: '" + RPATH + '/raw/' + INAME + "'")
 
     argparser.add_argument(
-        '-l', "--localtt",
-        help="'spud'\t'potato'   default: " +
-        RPATH + '/translationtable/' + INAME + '.yaml')
+        '-l', "--localtt", default=LOCAL_TT_PATH,
+        help="'spud'\t'potato'   default: " + LOCAL_TT_PATH)
 
     argparser.add_argument(
-        '-g', "--globaltt",
-        help="'potato'\t'PREFIX:p123'   default: " +
-        RPATH + '/translationtable/GLOBAL_TERM.yaml')
+        '-g', "--globaltt", default=GLOBAL_TT_PATH,
+        help="'potato'\t'PREFIX:p123'   default: " + GLOBAL_TT_PATH)
 
     # output '/dev/stdout' would be my first choice
     argparser.add_argument(
@@ -709,8 +721,8 @@ def parse():
 
     args = argparser.parse_args()
 
-    basename = re.sub(r'\.xml.gz$', '', args.filename)
-    filename = args.inputdir + '/' + args.filename
+    basename = re.sub(r'\.xml.gz$', '', args.filename)  #
+    filename = args.inputdir + '/' + args.filename      # gziped xml input file
     mapfile = args.inputdir + '/' + args.mapfile
 
     # be sure I/O paths exist
@@ -738,8 +750,8 @@ def parse():
     output = args.destination + '/' + args.output
 
     # catch and release input for future study
-    reject = args.inputdir + '/' + basename + '_reject.xml'
-    # ignore = args.inputdir + '/' + basename + '_ignore.txt'  # unused
+    reject = RPATH + '/' + basename + '_reject.xml'
+    # ignore = args.inputdir + '/' + INAME + '_ignore.txt'  # unused
     try:
         os.remove(reject)
     except FileNotFoundError:
@@ -810,7 +822,8 @@ def parse():
     # main loop over xml
     # taken in chunks composed of ClinVarSet stanzas
     with gzip.open(filename, 'rt') as clinvar_fh:
-        tree = ET.iterparse(clinvar_fh)  # w/o specifing events it defaults to 'end'
+        # w/o specifing events it defaults to 'end'
+        tree = ElementTree.iterparse(clinvar_fh)
         for event, element in tree:
             if element.tag != 'ClinVarSet':
                 ReleaseSet = element
@@ -1007,20 +1020,19 @@ def parse():
             # check that no members of rcv.genovar are none
             # and that at least one condition has an id and db
             if [1 for member in vars(rcv.genovar) if member is None] \
-                    or not [
-                        1 for condition in rcv.conditions
-                        if condition.id is not None and
-                        condition.database is not None]:
+                or not [
+                    1 for condition in rcv.conditions
+                    if condition.id is not None and condition.database is not None]:
                 LOG.info('%s is under specified. SKIPPING', rcv_acc)
                 rjct_cnt += 1
                 # Write this Clinvar set out so we can know what we are missing
                 print(
                     # minidom.parseString(
-                    #    ET.tostring(
+                    #    ElementTree.tostring(
                     #        ClinVarSet)).toprettyxml(
                     #           indent="   "), file=reject)
                     #  too slow. doubles time
-                    ET.tostring(ClinVarSet).decode('utf-8'), file=reject)
+                    ElementTree.tostring(ClinVarSet).decode('utf-8'), file=reject)
                 ClinVarSet.clear()
                 continue
 
@@ -1124,13 +1136,13 @@ def parse():
 
                     # <ClinVarVariant:rcv_variant_id><rdfs:label><rcv.variant.label>  .
 
-                    # <monarch_assoc><OBAN:association_has_object><rcv_disease_curi>  .
+                    # <monarch_assoc><OBAN:association_has_object><rcv_disease_curie>  .
                     write_spo(
                         monarch_assoc, GLOBALTT['association has object'],
                         rcv_disease_curie, rcvtriples,
                         subject_category=blv.terms['Association'],
                         object_category=blv.terms['Disease'])
-                    # <rcv_disease_curi><rdfs:label><rcv_disease_label>  .
+                    # <rcv_disease_curie><rdfs:label><rcv_disease_label>  .
                     # medgen might not have a disease label
                     if condition.label is not None:
                         write_spo(
@@ -1327,7 +1339,7 @@ def parse():
 
                     scv_significance = scv_geno = None
                     SCV_Description = ClinicalSignificance.find('./Description')
-                    if SCV_Description is not None:
+                    if SCV_Description not in ['not provided', None]:
                         scv_significance = SCV_Description.text.strip()
                         scv_geno = resolve(scv_significance)
                         unkwn = 'has_uncertain_significance_for_condition'
@@ -1371,10 +1383,15 @@ def parse():
                         else:
                             del rcvtriples[:]
                             continue
+                    else:
+                        del rcvtriples[:]
+                        continue
+
                     # if we have deleted the triples buffer then
                     # there is no point in continueing  (I don't think)
                     if not rcvtriples:
                         continue
+
                     # scv_assert_type = SCV_Assertion.find('./Assertion').get('Type')
                     # check scv_assert_type == 'variation to disease'?
                     # /SCV/ObservedIn/ObservedData/Citation/'ID[@Source="PubMed"]

diff --git a/scripts/ClinVarXML_Subset.py b/scripts/ClinVarXML_Subset.py
@@ -19,10 +19,11 @@
 IPATH = re.split(r'/', os.path.realpath(__file__))
 (INAME, DOTPY) = re.split(r'\.', IPATH[-1].lower())
 RPATH = '/' + '/'.join(IPATH[1:-2])
+FTP = 'ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml'
 files = {
     'f1': {
         'file': 'ClinVarFullRelease_00-latest.xml.gz',
-        'url': 'ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_00-latest.xml.gz'}
+        'url': '/'.join((FTP, 'ClinVarFullRelease_00-latest.xml.gz'))}
 }
 
 # handle arguments for IO
@@ -34,40 +35,40 @@
     help="input filename. default: '" + files['f1']['file'] + "'")
 
 ARGPARSER.add_argument(
-    '-i', '--inputdir', default=RPATH + '/raw/clinvarxml_alpha',
-    help="input path. default: '" + RPATH + '/raw/clinvarxml_alpha' "'")
+    '-i', '--inputdir', default=RPATH + '/raw/clinvar',
+    help="input path. default: '" + RPATH + '/raw/clinvar' "'")
 
 # OUTPUT
 ARGPARSER.add_argument(
-    '-d', "--destination", default=RPATH + '/raw/clinvarxml_alpha',
-    help='output path. default: "' + RPATH + '/raw/clinvarxml_alpha')
+    '-d', "--destination", default=RPATH + '/raw/clinvar',
+    help='output path. default: "' + RPATH + '/raw/clinvar')
 
 ARGPARSER.add_argument(
     '-o', "--output", default=INAME + '.xml',
     help='file name to write to')
 
 ARGPARSER.add_argument(
-    '-t', "--testfile", default=RPATH + '/raw/test/testid.txt',
-    help='file of IDs to capture')
+    '-t', "--testfile", default=RPATH + '/raw/test/clinvar_curie.txt',
+    help='file w/list of CURIEs to capture')
 
 ARGS = ARGPARSER.parse_args()
 
 FILENAME = ARGS.inputdir + '/' + ARGS.filename
 
 OUTPUT = ARGS.destination + '/' + ARGS.output
 
-VARIANT = []
-DISEASE = []
-GENE = []
-RCV = []
+VARIANT = []    # VCV
+DISEASE = []    # OMIM ...
+GENE = []       # NCBIGene  ...
+RCV = []        #
 
 with open(ARGS.testfile) as f:
     for line in f:
         line = line.partition('#')[0].strip()  # no comment
         if line != "":
             (prfx, lcl_id) = re.split(r':', line, 2)
             #
-            if prfx == 'ClinVar':
+            if prfx in ['RCV', 'ClinVar']:
                 RCV.append(lcl_id.strip())
 
             elif prfx == 'ClinVarVariant':
@@ -125,7 +126,7 @@
         # /RCV/MeasureSet/Measure/MeasureRelationship[@Type]/XRef[@DB="Gene"]/@ID
         RCV_MeasureSet = RCVAssertion.find('./MeasureSet')
         if RCV_MeasureSet is None:
-            #GenotypeSet
+            # GenotypeSet
             continue
         else:
             for rms in RCV_MeasureSet:

diff --git a/scripts/ClinVarXML_Subset.sh b/scripts/ClinVarXML_Subset.sh
@@ -1,5 +1,12 @@
 #! /bin/bash
 
+# 2020 DEPRECIATE.  (could still work on subsets)
+
+# note  ClinVar has gotten to big to to process without streaming on normal hardware
+# use the python element tree version instead of this
+# unless you have hundreds of gigs of ram
+
+
 # extract ClinVarSets containing specific RCV|SCV
 # not at all efficent, but does not need to be.
 # give a list of RCV|SCV identifiers and a dataset to find them in