refactor, add ClinVarSet and tests

EBIvariation · Aug 16, 2024 · 2100310 · 2100310
1 parent 304b8df
commit 2100310
Show file tree

Hide file tree

Showing 10 changed files with 228 additions and 115 deletions.
diff --git a/bin/cmat/VERSION b/bin/cmat/VERSION
@@ -1 +1 @@
-3.2.2
+3.3.0.dev
diff --git a/cmat/clinvar_xml_io/clinvar_record.py b/cmat/clinvar_xml_io/clinvar_record.py
@@ -1,6 +1,7 @@
 import logging
 import re
 import xml.etree.ElementTree as ElementTree
+from functools import cached_property
 from xml.dom import minidom
 
 from cmat.clinvar_xml_io.clinical_classification import ClinicalClassification, MultipleClinicalClassificationsError
@@ -43,16 +44,6 @@ def __init__(self, record_xml, xsd_version, trait_class=ClinVarTrait, measure_cl
         else:
             self.measure = measure_class(variant_measure, self)
 
-        # List of clinical classifications (Germline, Somatic, or Oncogenecity)
-        self.clinical_classifications = []
-        if self.xsd_version < 2:
-            # V1 only ever has a single clinical classification / clinical significance
-            self.clinical_classifications.append(
-                ClinicalClassification(find_mandatory_unique_element(self.record_xml, './ClinicalSignificance'), self))
-        else:
-            for clin_class in find_elements(self.record_xml, './Classifications/*'):
-                self.clinical_classifications.append(ClinicalClassification(clin_class, self))
-
     def __str__(self):
         return f'ClinVarRecord object with accession {self.accession}'
 
@@ -70,7 +61,7 @@ def accession(self):
         return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['Acc']
 
     @property
-    def date(self):
+    def last_updated_date(self):
         """This tracks the latest update date, counting even minor technical updates."""
         return self.record_xml.attrib['DateLastUpdated']
 
@@ -117,6 +108,19 @@ def valid_allele_origins(self):
         """Returns all valid allele origins, i.e. ones that are not in the list of nonspecific terms."""
         return {origin for origin in self.allele_origins if origin.lower() not in self.NONSPECIFIC_ALLELE_ORIGINS}
 
+    @cached_property
+    def clinical_classifications(self):
+        """List of clinical classifications (Germline, Somatic, or Oncogenicity)"""
+        clinical_classifications = []
+        if self.xsd_version < 2:
+            # V1 only ever has a single clinical classification / clinical significance
+            clinical_classifications.append(
+                ClinicalClassification(find_mandatory_unique_element(self.record_xml, './ClinicalSignificance'), self))
+        else:
+            for clin_class in find_elements(self.record_xml, './Classifications/*'):
+                clinical_classifications.append(ClinicalClassification(clin_class, self))
+        return clinical_classifications
+
     # The following properties are maintained for backwards compatibility, but are only present for a ClinVarRecord
     # if there is exactly one ClinicalClassification for the record.
     # Otherwise these should be taken from the ClinicalClassification objects directly.

diff --git a/cmat/clinvar_xml_io/clinvar_set.py b/cmat/clinvar_xml_io/clinvar_set.py
@@ -0,0 +1,30 @@
+from cmat.clinvar_xml_io import ClinVarRecord
+from cmat.clinvar_xml_io.clinvar_submitted_record import ClinVarSubmittedRecord
+from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, find_elements
+
+
+class ClinVarSet:
+    """
+    A ClinVarSet groups together a single reference record (RCV) and one or more submitted records (SCVs).
+    """
+
+    def __init__(self, cvs_xml, xsd_version):
+        self.cvs_xml = cvs_xml
+
+        rcv_elem = find_mandatory_unique_element(self.cvs_xml, 'ReferenceClinVarAssertion')
+        self.rcv = ClinVarRecord(rcv_elem, xsd_version)
+
+        scv_elems = find_elements(self.cvs_xml, 'ClinVarAssertion', allow_zero=False, allow_multiple=True)
+        self.scvs = [ClinVarSubmittedRecord(elem, xsd_version, self.rcv) for elem in scv_elems]
+
+    @property
+    def id(self):
+        return self.cvs_xml.attrib['ID']
+
+    @property
+    def title(self):
+        return find_mandatory_unique_element(self.cvs_xml, './Title').text
+
+    @property
+    def status(self):
+        return find_mandatory_unique_element(self.cvs_xml, './RecordStatus').text
diff --git a/cmat/clinvar_xml_io/clinvar_submitted_record.py b/cmat/clinvar_xml_io/clinvar_submitted_record.py
@@ -1,4 +1,5 @@
 import logging
+from functools import cached_property
 
 from cmat.clinvar_xml_io import ClinVarRecord
 from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element
@@ -9,9 +10,9 @@
 
 class ClinVarSubmittedRecord(ClinVarRecord):
     """
-    Submitted records (SCVs) are structured similarly to reference records (RCVs), though typically with fewer
-    annotations - for example, variant coordinates, HGVS expressions or ontology mappings which are added by curators.
-    However, these attributes are also technically optional in the RCVs so the code inheritance is possible.
+    Submitted records (SCVs) are structured similarly to reference records (RCVs), with a few exceptions. They
+    typically have fewer annotations, such as the variant coordinates, HGVS expressions or ontology mappings that are
+    added by curators.
 
     SCVs also contain additional information about the actual submission, which we model in this class.
     """
@@ -21,15 +22,40 @@ def __init__(self, record_xml, xsd_version, reference_record):
         # Each SCV is associated with a single RCV
         self.reference_record = reference_record
 
+    def __str__(self):
+        return f'ClinVarSubmittedRecord object with accession {self.accession}'
+
     @property
     def submission_date(self):
+        """Date of submission or when submission was last revised (for first submission, use created_date)."""
         return find_mandatory_unique_element(self.record_xml, './ClinVarSubmissionID').attrib['submitterDate']
 
+    @property
+    def last_updated_date(self):
+        """Overrides parent definition, in SCV this date is in the accession element"""
+        return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['DateUpdated']
+
+    @property
+    def created_date(self):
+        """Overrides parent definition, in SCV this date is in the accession element"""
+        return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['DateCreated']
+
     @property
     def submitter(self):
+        """Name of the submitting organization."""
         return find_mandatory_unique_element(self.record_xml, './ClinVarSubmissionID').attrib['submitter']
 
+    @property
+    def submitter_id(self):
+        """Numeric identifier associated with the submitting organization."""
+        return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['OrgID']
+
     @property
     def submission_name(self):
-        # TODO - check whether this is the correct property to filter on
-        return self.record_xml.attrib['SubmissionName']
+        """Name or identifier associated with the submission. This is optional."""
+        return self.record_xml.attrib.get('SubmissionName', None)
+
+    @cached_property
+    def clinical_classifications(self):
+        # Submitted record clinical classifications are defined a bit differently than reference records
+        raise NotImplementedError('Clinical classification parsing not implemented for SCVs')
diff --git a/cmat/clinvar_xml_io/xml_parsing.py b/cmat/clinvar_xml_io/xml_parsing.py
@@ -30,18 +30,22 @@ def parse_header_attributes(clinvar_xml):
 
 def iterate_rcv_from_xml(clinvar_xml):
     """Iterates through the gzipped ClinVar XML and yields complete <ReferenceClinVarAssertion> records."""
+    for cvs in iterate_cvs_from_xml(clinvar_xml):
+        # Go to a ReferenceClinVarAssertion element. This corresponds to a single RCV record, the main unit of
+        # ClinVar. There should only be one such record per ClinVarSet.
+        rcv = find_mandatory_unique_element(cvs, 'ReferenceClinVarAssertion')
+        yield rcv
+
+
+def iterate_cvs_from_xml(clinvar_xml):
+    """Iterates through the gzipped ClinVar XML and yields complete <ClinVarSet> elements."""
     with gzip.open(clinvar_xml, 'rt') as fh:
         for event, elem in ElementTree.iterparse(fh):
             # Wait until we have built a complete ClinVarSet element
             if elem.tag != 'ClinVarSet':
                 continue
-
-            # Go to a ReferenceClinVarAssertion element. This corresponds to a single RCV record, the main unit of
-            # ClinVar. There should only be one such record per ClinVarSet.
-            rcv = find_mandatory_unique_element(elem, 'ReferenceClinVarAssertion')
-
             # Return the complete record and then remove the processed element from the tree to save memory
-            yield rcv
+            yield elem
             elem.clear()
 
 

diff --git a/data-exploration/filter_clinvar_xml.py b/data-exploration/filter_clinvar_xml.py
@@ -4,7 +4,7 @@
 import xml.etree.ElementTree as ElementTree
 
 from cmat.clinvar_xml_io import ClinVarRecord
-from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element
+from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, iterate_cvs_from_xml
 from cmat.output_generation.clinvar_to_evidence_strings import get_consequence_types
 from cmat.output_generation.consequence_type import process_consequence_type_file
 
@@ -18,18 +18,6 @@ def pprint(x):
     print(ElementTree.tostring(x, encoding='unicode'))
 
 
-def iterate_cvs_from_xml(clinvar_xml):
-    """Similar to iterate_rcv_from_xml in clinvar_xml_utils, but keeps the entire ClinVarSet XML element.
-    This allows us to construct a valid ClinVar XML for easy future processing."""
-    with gzip.open(clinvar_xml, 'rt') as fh:
-        for event, elem in ElementTree.iterparse(fh):
-            # Wait until we have built a complete ClinVarSet element
-            if elem.tag != 'ClinVarSet':
-                continue
-            yield elem
-            elem.clear()
-
-
 def filter_xml(input_xml, output_xml, filter_fct, max_num=None):
     """ Filter input_xml by boolean condition defined by filter_fct and write to output_xml.
     If max_num is given, will write at most max_num records, otherwise writes all."""

diff --git a/tests/clinvar_xml_io/test_clinvar_measure.py b/tests/clinvar_xml_io/test_clinvar_measure.py
@@ -0,0 +1,40 @@
+import os
+
+from cmat.clinvar_xml_io import ClinVarDataset
+
+resources_dir = os.path.join(os.path.dirname(__file__), 'resources')
+
+
+class TestClinvarRecordMeasure:
+    @classmethod
+    def setup_class(cls):
+        input_file = os.path.join(resources_dir, 'clinvar_dataset_v2.xml.gz')
+        cls.test_crm = next(iter(ClinVarDataset(input_file))).measure
+
+    def test_hgvs(self):
+        text_hgvs = [h.text for h in self.test_crm.all_hgvs]
+        assert text_hgvs == ['NM_152443.3:c.677A>G',
+                             'NG_008321.1:g.32324A>G',
+                             'NC_000014.9:g.67729209A>G',
+                             'NC_000014.8:g.68195926A>G',
+                             'NM_152443.2:c.677A>G',
+                             'Q96NR8:p.Tyr226Cys',
+                             'NP_689656.2:p.Tyr226Cys']
+
+    def test_preferred_current_hgvs(self):
+        assert self.test_crm.preferred_current_hgvs.text == 'NC_000014.9:g.67729209A>G'
+
+    def test_rs(self):
+        assert self.test_crm.rs_id == 'rs28940313'
+
+    def test_nsv(self):
+        assert self.test_crm.nsv_id is None
+
+    def test_variant_type(self):
+        assert self.test_crm.variant_type == 'single nucleotide variant'
+
+    def test_measure_set_pubmed_refs(self):
+        assert self.test_crm.pubmed_refs == []
+
+    def test_so_terms(self):
+        assert self.test_crm.existing_so_terms == {'SO:0001583'}
diff --git a/tests/clinvar_xml_io/test_clinvar_record.py b/tests/clinvar_xml_io/test_clinvar_record.py
@@ -32,3 +32,45 @@ def test_multiple_clinical_classifications_record():
     assert set(cc.type for cc in record.clinical_classifications) == {'GermlineClassification', 'SomaticClinicalImpact'}
     with pytest.raises(MultipleClinicalClassificationsError):
         print(record.valid_clinical_significances)
+
+
+class TestClinvarRecord:
+    @classmethod
+    def setup_class(cls):
+        input_file = os.path.join(resources_dir, 'clinvar_dataset_v2.xml.gz')
+        cls.test_clinvar_record = next(iter(ClinVarDataset(input_file)))
+
+    def test_date(self):
+        """Check that the last updated date of the ReferenceClinVarAssertion is loaded correctly"""
+        assert self.test_clinvar_record.last_updated_date == '2024-04-15'
+
+    def test_score(self):
+        assert self.test_clinvar_record.score == 2
+
+    def test_review_status(self):
+        assert self.test_clinvar_record.review_status == 'criteria provided, multiple submitters, no conflicts'
+
+    def test_acc(self):
+        assert self.test_clinvar_record.accession == 'RCV000002127'
+
+    def test_traits(self):
+        assert self.test_clinvar_record.traits[0].preferred_name == 'Leber congenital amaurosis 13'
+        assert self.test_clinvar_record.traits[0].preferred_or_other_valid_name == 'Leber congenital amaurosis 13'
+
+    def test_trait_pubmed_refs(self):
+        assert self.test_clinvar_record.traits[0].pubmed_refs == [20301590, 30285347]
+
+    def test_observed_pubmed_refs(self):
+        assert self.test_clinvar_record.evidence_support_pubmed_refs == [15258582, 15322982]
+
+    def test_clinical_significance(self):
+        assert self.test_clinvar_record.clinical_significance_list == ['likely pathogenic', 'pathogenic']
+
+    def test_allele_origins(self):
+        assert self.test_clinvar_record.allele_origins == {'germline', 'inherited', 'unknown'}
+
+    def test_valid_allele_origins(self):
+        assert self.test_clinvar_record.valid_allele_origins == {'germline', 'inherited'}
+
+    def test_trait_efo_ids(self):
+        assert self.test_clinvar_record.traits[0].current_efo_aligned_xrefs == [('MONDO', 'MONDO:0012990', 'current')]
diff --git a/tests/clinvar_xml_io/test_clinvar_submitted_record.py b/tests/clinvar_xml_io/test_clinvar_submitted_record.py
@@ -0,0 +1,58 @@
+import os
+
+import pytest
+
+from cmat.clinvar_xml_io.clinvar_set import ClinVarSet
+from cmat.clinvar_xml_io.xml_parsing import iterate_cvs_from_xml
+
+
+@pytest.fixture
+def clinvar_set():
+    resources_dir = os.path.join(os.path.dirname(__file__), 'resources')
+    input_file = os.path.join(resources_dir, 'clinvar_dataset_v2.xml.gz')
+    return ClinVarSet(next(iterate_cvs_from_xml(input_file)), 2.0)
+
+
+@pytest.fixture
+def submitted_record(clinvar_set):
+    return clinvar_set.scvs[0]
+
+
+def test_clinvar_set(clinvar_set):
+    assert clinvar_set.rcv.accession == 'RCV000002127'
+    assert len(clinvar_set.scvs) == 5
+    assert clinvar_set.id == '188870850'
+    assert clinvar_set.title == 'NM_152443.3(RDH12):c.677A>G (p.Tyr226Cys) AND Leber congenital amaurosis 13'
+    assert clinvar_set.status == 'current'
+
+
+def test_clinvar_submitted_record(submitted_record):
+    assert submitted_record.accession == 'SCV000022285'
+    assert submitted_record.submitter == 'OMIM'
+    assert submitted_record.valid_allele_origins == {'germline'}
+    assert submitted_record.evidence_support_pubmed_refs == [15258582, 15322982]
+
+    assert submitted_record.created_date == '2013-04-04'  # submission first publicly available
+    assert submitted_record.submission_date == '2015-07-02'  # submission last revised
+    assert submitted_record.last_updated_date == '2015-07-05'  # submission last revision publicly available
+
+    with pytest.raises(NotImplementedError):
+        assert submitted_record.valid_clinical_significances
+
+
+def test_clinvar_submitted_record_trait(submitted_record):
+    assert len(submitted_record.traits_with_valid_names) == 1
+    scv_trait = submitted_record.traits_with_valid_names[0]
+
+    assert scv_trait.preferred_or_other_valid_name == 'LEBER CONGENITAL AMAUROSIS 13'
+    assert scv_trait.current_efo_aligned_xrefs == []
+
+
+def test_clinvar_submitted_record_measure(submitted_record):
+    assert submitted_record.measure is not None
+    scv_measure = submitted_record.measure
+
+    assert scv_measure.preferred_or_other_name == 'RDH12, TYR226CYS'
+    assert scv_measure.preferred_current_hgvs is None
+    assert not scv_measure.has_complete_coordinates
+    assert scv_measure.variant_type == 'Variation'