Skip to content

Commit

Permalink
refactor, add ClinVarSet and tests
Browse files Browse the repository at this point in the history
  • Loading branch information
apriltuesday committed Aug 16, 2024
1 parent 304b8df commit 2100310
Show file tree
Hide file tree
Showing 10 changed files with 228 additions and 115 deletions.
2 changes: 1 addition & 1 deletion bin/cmat/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.2.2
3.3.0.dev
26 changes: 15 additions & 11 deletions cmat/clinvar_xml_io/clinvar_record.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
import re
import xml.etree.ElementTree as ElementTree
from functools import cached_property
from xml.dom import minidom

from cmat.clinvar_xml_io.clinical_classification import ClinicalClassification, MultipleClinicalClassificationsError
Expand Down Expand Up @@ -43,16 +44,6 @@ def __init__(self, record_xml, xsd_version, trait_class=ClinVarTrait, measure_cl
else:
self.measure = measure_class(variant_measure, self)

# List of clinical classifications (Germline, Somatic, or Oncogenicity)
self.clinical_classifications = []
if self.xsd_version < 2:
# V1 only ever has a single clinical classification / clinical significance
self.clinical_classifications.append(
ClinicalClassification(find_mandatory_unique_element(self.record_xml, './ClinicalSignificance'), self))
else:
for clin_class in find_elements(self.record_xml, './Classifications/*'):
self.clinical_classifications.append(ClinicalClassification(clin_class, self))

def __str__(self):
    """Short human-readable label that includes the record accession."""
    accession = self.accession
    return f'ClinVarRecord object with accession {accession}'

Expand All @@ -70,7 +61,7 @@ def accession(self):
return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['Acc']

@property
def date(self):
def last_updated_date(self):
"""This tracks the latest update date, counting even minor technical updates."""
return self.record_xml.attrib['DateLastUpdated']

Expand Down Expand Up @@ -117,6 +108,19 @@ def valid_allele_origins(self):
"""Returns all valid allele origins, i.e. ones that are not in the list of nonspecific terms."""
return {origin for origin in self.allele_origins if origin.lower() not in self.NONSPECIFIC_ALLELE_ORIGINS}

@cached_property
def clinical_classifications(self):
    """List of clinical classifications (Germline, Somatic, or Oncogenicity) for this record.

    Built on first access and cached on the instance afterwards.
    """
    if self.xsd_version < 2:
        # V1 only ever has a single clinical classification / clinical significance
        significance_xml = find_mandatory_unique_element(self.record_xml, './ClinicalSignificance')
        return [ClinicalClassification(significance_xml, self)]
    # Otherwise every child of <Classifications> becomes one classification object
    return [
        ClinicalClassification(classification_xml, self)
        for classification_xml in find_elements(self.record_xml, './Classifications/*')
    ]

# The following properties are maintained for backwards compatibility, but are only present for a ClinVarRecord
# if there is exactly one ClinicalClassification for the record.
# Otherwise these should be taken from the ClinicalClassification objects directly.
Expand Down
30 changes: 30 additions & 0 deletions cmat/clinvar_xml_io/clinvar_set.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from cmat.clinvar_xml_io import ClinVarRecord
from cmat.clinvar_xml_io.clinvar_submitted_record import ClinVarSubmittedRecord
from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, find_elements


class ClinVarSet:
    """
    Wrapper around a <ClinVarSet> XML element, which groups a single reference record (RCV) with one or more
    submitted records (SCVs).
    """

    def __init__(self, cvs_xml, xsd_version):
        """Build the reference record and all submitted records found under cvs_xml.

        :param cvs_xml: the ClinVarSet XML element
        :param xsd_version: schema version, passed through unchanged to every record that is created
        """
        self.cvs_xml = cvs_xml

        # The reference record is looked up as a mandatory, unique child
        reference_xml = find_mandatory_unique_element(self.cvs_xml, 'ReferenceClinVarAssertion')
        self.rcv = ClinVarRecord(reference_xml, xsd_version)

        # Every submitted record is given a handle on the reference record of this set
        submitted_records = []
        for submitted_xml in find_elements(self.cvs_xml, 'ClinVarAssertion', allow_zero=False, allow_multiple=True):
            submitted_records.append(ClinVarSubmittedRecord(submitted_xml, xsd_version, self.rcv))
        self.scvs = submitted_records

    @property
    def id(self):
        """Value of the ID attribute of the ClinVarSet element."""
        attributes = self.cvs_xml.attrib
        return attributes['ID']

    @property
    def title(self):
        """Text content of the Title child element."""
        title_xml = find_mandatory_unique_element(self.cvs_xml, './Title')
        return title_xml.text

    @property
    def status(self):
        """Text content of the RecordStatus child element."""
        status_xml = find_mandatory_unique_element(self.cvs_xml, './RecordStatus')
        return status_xml.text
36 changes: 31 additions & 5 deletions cmat/clinvar_xml_io/clinvar_submitted_record.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
from functools import cached_property

from cmat.clinvar_xml_io import ClinVarRecord
from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element
Expand All @@ -9,9 +10,9 @@

class ClinVarSubmittedRecord(ClinVarRecord):
"""
Submitted records (SCVs) are structured similarly to reference records (RCVs), though typically with fewer
annotations - for example, variant coordinates, HGVS expressions or ontology mappings which are added by curators.
However, these attributes are also technically optional in the RCVs so the code inheritance is possible.
Submitted records (SCVs) are structured similarly to reference records (RCVs) with a few exceptions, though they
typically have fewer annotations - for example, variant coordinates, HGVS expressions or ontology mappings which are
added by curators.
SCVs also contain additional information about the actual submission, which we model in this class.
"""
Expand All @@ -21,15 +22,40 @@ def __init__(self, record_xml, xsd_version, reference_record):
# Each SCV is associated with a single RCV
self.reference_record = reference_record

def __str__(self):
    """Short human-readable label that includes the submitted record accession."""
    accession = self.accession
    return f'ClinVarSubmittedRecord object with accession {accession}'

@property
def submission_date(self):
    """Date of submission or when submission was last revised (for first submission, use created_date).

    Taken from the submitterDate attribute of the ClinVarSubmissionID element.
    """
    submission_id_xml = find_mandatory_unique_element(self.record_xml, './ClinVarSubmissionID')
    return submission_id_xml.attrib['submitterDate']

@property
def last_updated_date(self):
    """Overrides parent definition, in SCV this date is in the accession element.

    Taken from the DateUpdated attribute of the ClinVarAccession element.
    """
    accession_xml = find_mandatory_unique_element(self.record_xml, './ClinVarAccession')
    return accession_xml.attrib['DateUpdated']

@property
def created_date(self):
    """Overrides parent definition, in SCV this date is in the accession element.

    Taken from the DateCreated attribute of the ClinVarAccession element.
    """
    accession_xml = find_mandatory_unique_element(self.record_xml, './ClinVarAccession')
    return accession_xml.attrib['DateCreated']

@property
def submitter(self):
    """Name of the submitting organization, read from the submitter attribute of ClinVarSubmissionID."""
    submission_id_xml = find_mandatory_unique_element(self.record_xml, './ClinVarSubmissionID')
    return submission_id_xml.attrib['submitter']

@property
def submitter_id(self):
    """Identifier of the submitting organization, read from the OrgID attribute of ClinVarAccession."""
    accession_xml = find_mandatory_unique_element(self.record_xml, './ClinVarAccession')
    return accession_xml.attrib['OrgID']

@property
def submission_name(self):
    """Name or identifier associated with the submission. This is optional.

    :return: value of the SubmissionName attribute of the record element, or None when the attribute is absent
    """
    # The attribute is optional, so use a tolerant lookup instead of indexing (which raises KeyError)
    return self.record_xml.attrib.get('SubmissionName')

@cached_property
def clinical_classifications(self):
    """Not supported for submitted records.

    Submitted record clinical classifications are defined a bit differently than reference records, and parsing
    them is not implemented.

    :raises NotImplementedError: always
    """
    message = 'Clinical classification parsing not implemented for SCVs'
    raise NotImplementedError(message)
16 changes: 10 additions & 6 deletions cmat/clinvar_xml_io/xml_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,22 @@ def parse_header_attributes(clinvar_xml):

def iterate_rcv_from_xml(clinvar_xml):
    """Iterates through the gzipped ClinVar XML and yields complete <ReferenceClinVarAssertion> records.

    One record is yielded per ClinVarSet produced by iterate_cvs_from_xml.
    """
    for clinvar_set_xml in iterate_cvs_from_xml(clinvar_xml):
        # A ReferenceClinVarAssertion corresponds to a single RCV record, the main unit of ClinVar.
        # There should only be one such record per ClinVarSet.
        yield find_mandatory_unique_element(clinvar_set_xml, 'ReferenceClinVarAssertion')


def iterate_cvs_from_xml(clinvar_xml):
    """Iterates through the gzipped ClinVar XML and yields complete <ClinVarSet> elements.

    :param clinvar_xml: path to the gzip-compressed ClinVar XML file
    :return: generator yielding each ClinVarSet element exactly once. The element is cleared when the consumer
        requests the next one, so it has to be processed before the iteration advances.
    """
    with gzip.open(clinvar_xml, 'rt') as fh:
        for _, elem in ElementTree.iterparse(fh):
            # Wait until we have built a complete ClinVarSet element
            if elem.tag != 'ClinVarSet':
                continue

            # Return the complete set and then remove the processed element from the tree to save memory
            yield elem
            elem.clear()


Expand Down
14 changes: 1 addition & 13 deletions data-exploration/filter_clinvar_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import xml.etree.ElementTree as ElementTree

from cmat.clinvar_xml_io import ClinVarRecord
from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element
from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, iterate_cvs_from_xml
from cmat.output_generation.clinvar_to_evidence_strings import get_consequence_types
from cmat.output_generation.consequence_type import process_consequence_type_file

Expand All @@ -18,18 +18,6 @@ def pprint(x):
print(ElementTree.tostring(x, encoding='unicode'))


def iterate_cvs_from_xml(clinvar_xml):
    """Yield every complete ClinVarSet XML element from the gzipped ClinVar XML.

    Unlike iterate_rcv_from_xml in clinvar_xml_utils, the entire ClinVarSet element is kept, which allows us to
    construct a valid ClinVar XML for easy future processing.
    """
    with gzip.open(clinvar_xml, 'rt') as xml_handle:
        for _, node in ElementTree.iterparse(xml_handle):
            # Only complete ClinVarSet elements are of interest
            if node.tag == 'ClinVarSet':
                yield node
                # Once the consumer is done with the element, drop its contents
                node.clear()


def filter_xml(input_xml, output_xml, filter_fct, max_num=None):
""" Filter input_xml by boolean condition defined by filter_fct and write to output_xml.
If max_num is given, will write at most max_num records, otherwise writes all."""
Expand Down
40 changes: 40 additions & 0 deletions tests/clinvar_xml_io/test_clinvar_measure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import os

from cmat.clinvar_xml_io import ClinVarDataset

resources_dir = os.path.join(os.path.dirname(__file__), 'resources')


class TestClinvarRecordMeasure:
    """Checks on the measure of the first record in the v2 test dataset."""

    @classmethod
    def setup_class(cls):
        """Load the dataset once and keep the measure of its first record."""
        dataset_path = os.path.join(resources_dir, 'clinvar_dataset_v2.xml.gz')
        first_record = next(iter(ClinVarDataset(dataset_path)))
        cls.test_crm = first_record.measure

    def test_hgvs(self):
        """All HGVS expressions are returned, in this order."""
        expected_hgvs = [
            'NM_152443.3:c.677A>G',
            'NG_008321.1:g.32324A>G',
            'NC_000014.9:g.67729209A>G',
            'NC_000014.8:g.68195926A>G',
            'NM_152443.2:c.677A>G',
            'Q96NR8:p.Tyr226Cys',
            'NP_689656.2:p.Tyr226Cys',
        ]
        assert [hgvs.text for hgvs in self.test_crm.all_hgvs] == expected_hgvs

    def test_preferred_current_hgvs(self):
        preferred = self.test_crm.preferred_current_hgvs
        assert preferred.text == 'NC_000014.9:g.67729209A>G'

    def test_rs(self):
        rs_id = self.test_crm.rs_id
        assert rs_id == 'rs28940313'

    def test_nsv(self):
        nsv_id = self.test_crm.nsv_id
        assert nsv_id is None

    def test_variant_type(self):
        variant_type = self.test_crm.variant_type
        assert variant_type == 'single nucleotide variant'

    def test_measure_set_pubmed_refs(self):
        pubmed_refs = self.test_crm.pubmed_refs
        assert pubmed_refs == []

    def test_so_terms(self):
        so_terms = self.test_crm.existing_so_terms
        assert so_terms == {'SO:0001583'}
42 changes: 42 additions & 0 deletions tests/clinvar_xml_io/test_clinvar_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,45 @@ def test_multiple_clinical_classifications_record():
assert set(cc.type for cc in record.clinical_classifications) == {'GermlineClassification', 'SomaticClinicalImpact'}
with pytest.raises(MultipleClinicalClassificationsError):
print(record.valid_clinical_significances)


class TestClinvarRecord:
    """Checks on the first record in the v2 test dataset."""

    @classmethod
    def setup_class(cls):
        """Load the dataset once and keep its first record."""
        dataset_path = os.path.join(resources_dir, 'clinvar_dataset_v2.xml.gz')
        dataset = ClinVarDataset(dataset_path)
        cls.test_clinvar_record = next(iter(dataset))

    def test_date(self):
        """Check that the last updated date of the referenceClinVarAssertion is loaded correctly"""
        last_updated = self.test_clinvar_record.last_updated_date
        assert last_updated == '2024-04-15'

    def test_score(self):
        score = self.test_clinvar_record.score
        assert score == 2

    def test_review_status(self):
        review_status = self.test_clinvar_record.review_status
        assert review_status == 'criteria provided, multiple submitters, no conflicts'

    def test_acc(self):
        accession = self.test_clinvar_record.accession
        assert accession == 'RCV000002127'

    def test_traits(self):
        expected_name = 'Leber congenital amaurosis 13'
        assert self.test_clinvar_record.traits[0].preferred_name == expected_name
        assert self.test_clinvar_record.traits[0].preferred_or_other_valid_name == expected_name

    def test_trait_pubmed_refs(self):
        first_trait = self.test_clinvar_record.traits[0]
        assert first_trait.pubmed_refs == [20301590, 30285347]

    def test_observed_pubmed_refs(self):
        pubmed_refs = self.test_clinvar_record.evidence_support_pubmed_refs
        assert pubmed_refs == [15258582, 15322982]

    def test_clinical_significance(self):
        significances = self.test_clinvar_record.clinical_significance_list
        assert significances == ['likely pathogenic', 'pathogenic']

    def test_allele_origins(self):
        origins = self.test_clinvar_record.allele_origins
        assert origins == {'germline', 'inherited', 'unknown'}

    def test_valid_allele_origins(self):
        valid_origins = self.test_clinvar_record.valid_allele_origins
        assert valid_origins == {'germline', 'inherited'}

    def test_trait_efo_ids(self):
        first_trait = self.test_clinvar_record.traits[0]
        assert first_trait.current_efo_aligned_xrefs == [('MONDO', 'MONDO:0012990', 'current')]
58 changes: 58 additions & 0 deletions tests/clinvar_xml_io/test_clinvar_submitted_record.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import os

import pytest

from cmat.clinvar_xml_io.clinvar_set import ClinVarSet
from cmat.clinvar_xml_io.xml_parsing import iterate_cvs_from_xml


@pytest.fixture
def clinvar_set():
    """ClinVarSet built from the first <ClinVarSet> element of the v2 test dataset."""
    test_dir = os.path.dirname(__file__)
    dataset_path = os.path.join(test_dir, 'resources', 'clinvar_dataset_v2.xml.gz')
    first_cvs_xml = next(iterate_cvs_from_xml(dataset_path))
    return ClinVarSet(first_cvs_xml, 2.0)


@pytest.fixture
def submitted_record(clinvar_set):
    """First submitted record (SCV) of the clinvar_set fixture."""
    first_scv, *_ = clinvar_set.scvs[:1]
    return first_scv


def test_clinvar_set(clinvar_set):
    """The set exposes its reference record, its submitted records and its own metadata."""
    expected_title = 'NM_152443.3(RDH12):c.677A>G (p.Tyr226Cys) AND Leber congenital amaurosis 13'

    assert clinvar_set.rcv.accession == 'RCV000002127'
    assert len(clinvar_set.scvs) == 5
    assert clinvar_set.id == '188870850'
    assert clinvar_set.title == expected_title
    assert clinvar_set.status == 'current'


def test_clinvar_submitted_record(submitted_record):
    """Inherited and SCV-specific attributes of the first submitted record."""
    assert submitted_record.accession == 'SCV000022285'
    assert submitted_record.submitter == 'OMIM'
    assert submitted_record.valid_allele_origins == {'germline'}
    assert submitted_record.evidence_support_pubmed_refs == [15258582, 15322982]

    # submission first publicly available
    assert submitted_record.created_date == '2013-04-04'
    # submission last revised
    assert submitted_record.submission_date == '2015-07-02'
    # submission last revision publicly available
    assert submitted_record.last_updated_date == '2015-07-05'

    # Anything derived from clinical classifications is expected to be unavailable for SCVs
    with pytest.raises(NotImplementedError):
        assert submitted_record.valid_clinical_significances


def test_clinvar_submitted_record_trait(submitted_record):
    """The submitted record has a single trait with a valid name and no EFO-aligned xrefs."""
    valid_traits = submitted_record.traits_with_valid_names
    assert len(valid_traits) == 1

    scv_trait = submitted_record.traits_with_valid_names[0]
    assert scv_trait.preferred_or_other_valid_name == 'LEBER CONGENITAL AMAUROSIS 13'
    assert scv_trait.current_efo_aligned_xrefs == []


def test_clinvar_submitted_record_measure(submitted_record):
    """The submitted record has a measure with a name but no HGVS and no complete coordinates."""
    assert submitted_record.measure is not None

    scv_measure = submitted_record.measure
    assert scv_measure.preferred_or_other_name == 'RDH12, TYR226CYS'
    assert scv_measure.preferred_current_hgvs is None
    assert not scv_measure.has_complete_coordinates
    assert scv_measure.variant_type == 'Variation'
Loading

0 comments on commit 2100310

Please sign in to comment.