diff --git a/cmat/clinvar_xml_io/clinvar_dataset.py b/cmat/clinvar_xml_io/clinvar_dataset.py index b436f11a..a896cbdd 100644 --- a/cmat/clinvar_xml_io/clinvar_dataset.py +++ b/cmat/clinvar_xml_io/clinvar_dataset.py @@ -3,7 +3,7 @@ import re from datetime import date -from cmat.clinvar_xml_io.clinvar_record import ClinVarRecord +from cmat.clinvar_xml_io.clinvar_reference_record import ClinVarReferenceRecord from cmat.clinvar_xml_io.xml_parsing import iterate_rcv_from_xml, parse_header_attributes logger = logging.getLogger(__name__) @@ -20,7 +20,7 @@ def __init__(self, clinvar_xml): def __iter__(self): for rcv in iterate_rcv_from_xml(self.clinvar_xml): - yield ClinVarRecord(rcv, self.xsd_version) + yield ClinVarReferenceRecord(rcv, self.xsd_version) def get_xsd_version(self): # For format, see https://github.com/ncbi/clinvar/blob/master/FTPSiteXsdChanges.md diff --git a/cmat/clinvar_xml_io/clinvar_record.py b/cmat/clinvar_xml_io/clinvar_record.py index db0ac421..18635539 100644 --- a/cmat/clinvar_xml_io/clinvar_record.py +++ b/cmat/clinvar_xml_io/clinvar_record.py @@ -4,7 +4,7 @@ from functools import cached_property from xml.dom import minidom -from cmat.clinvar_xml_io.clinical_classification import ClinicalClassification, MultipleClinicalClassificationsError +from cmat.clinvar_xml_io.clinical_classification import MultipleClinicalClassificationsError from cmat.clinvar_xml_io.clinvar_measure import ClinVarRecordMeasure from cmat.clinvar_xml_io.clinvar_trait import ClinVarTrait from cmat.clinvar_xml_io.xml_parsing import find_elements, find_optional_unique_element, \ @@ -15,10 +15,10 @@ class ClinVarRecord: - """Instances of this class hold data on individual ClinVar records. See also: - * /data-exploration/clinvar-variant-types/README.md for the in-depth explanation of ClinVar data model; - * Issue https://github.com/EBIvariation/eva-opentargets/issues/127 for the most recent discussions on changing - support of different ClinVar record types.""" + """ + Base class for both reference and submitted records in ClinVar. See also: + /data-exploration/clinvar-variant-types/README.md for the in-depth explanation of ClinVar data model + """ # Some allele origin terms in ClinVar are essentially conveying lack of information and are thus not useful. NONSPECIFIC_ALLELE_ORIGINS = {'unknown', 'not provided', 'not applicable', 'tested-inconclusive', 'not-reported'} @@ -62,13 +62,15 @@ def accession(self): @property def last_updated_date(self): - """This tracks the latest update date, counting even minor technical updates.""" - return self.record_xml.attrib['DateLastUpdated'] + """This tracks the latest update date, counting even minor technical updates. + Appears differently in reference and submitted records.""" + raise NotImplementedError @property def created_date(self): - """This tracks the date the record was first made public on ClinVar.""" - return self.record_xml.attrib['DateCreated'] + """This tracks the date the record was first made public on ClinVar. + Appears differently in reference and submitted records.""" + raise NotImplementedError @property def mode_of_inheritance(self): @@ -111,15 +113,7 @@ def valid_allele_origins(self): @cached_property def clinical_classifications(self): """List of clinical classifications (Germline, Somatic, or Oncogenecity)""" - clinical_classifications = [] - if self.xsd_version < 2: - # V1 only ever has a single clinical classification / clinical significance - clinical_classifications.append( - ClinicalClassification(find_mandatory_unique_element(self.record_xml, './ClinicalSignificance'), self)) - else: - for clin_class in find_elements(self.record_xml, './Classifications/*'): - clinical_classifications.append(ClinicalClassification(clin_class, self)) - return clinical_classifications + raise NotImplementedError # The following properties are maintained for backwards compatibility, but are only present for a ClinVarRecord # if there is exactly one ClinicalClassification for the record. diff --git a/cmat/clinvar_xml_io/clinvar_reference_record.py b/cmat/clinvar_xml_io/clinvar_reference_record.py new file mode 100644 index 00000000..90bd4f4e --- /dev/null +++ b/cmat/clinvar_xml_io/clinvar_reference_record.py @@ -0,0 +1,41 @@ +import logging +from functools import cached_property + +from cmat.clinvar_xml_io.clinical_classification import ClinicalClassification + +from cmat.clinvar_xml_io.clinvar_record import ClinVarRecord +from cmat.clinvar_xml_io.xml_parsing import find_mandatory_unique_element, find_elements + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class ClinVarReferenceRecord(ClinVarRecord): + """Reference records (RCVs) summarise information from submitted records (SCVs) and include additional annotations + and cross-references supplied by ClinVar.""" + + def __init__(self, record_xml, xsd_version): + super().__init__(record_xml, xsd_version) + + def __str__(self): + return f'ClinVarReferenceRecord object with accession {self.accession}' + + @property + def last_updated_date(self): + return self.record_xml.attrib['DateLastUpdated'] + + @property + def created_date(self): + return self.record_xml.attrib['DateCreated'] + + @cached_property + def clinical_classifications(self): + clinical_classifications = [] + if self.xsd_version < 2: + # V1 only ever has a single clinical classification / clinical significance + clinical_classifications.append( + ClinicalClassification(find_mandatory_unique_element(self.record_xml, './ClinicalSignificance'), self)) + else: + for clin_class in find_elements(self.record_xml, './Classifications/*'): + clinical_classifications.append(ClinicalClassification(clin_class, self)) + return clinical_classifications diff --git a/cmat/clinvar_xml_io/clinvar_submitted_record.py b/cmat/clinvar_xml_io/clinvar_submitted_record.py index 1d0264b6..7934cea3 100644 --- a/cmat/clinvar_xml_io/clinvar_submitted_record.py +++ b/cmat/clinvar_xml_io/clinvar_submitted_record.py @@ -32,12 +32,10 @@ def submission_date(self): @property def last_updated_date(self): - """Overrides parent definition, in SCV this date is in the accession element""" return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['DateUpdated'] @property def created_date(self): - """Overrides parent definition, in SCV this date is in the accession element""" return find_mandatory_unique_element(self.record_xml, './ClinVarAccession').attrib['DateCreated'] @property diff --git a/data-exploration/drug-response/drug-response-background-trait.ipynb b/data-exploration/drug-response/drug-response-background-trait.ipynb index ca9aab01..8db5a3bb 100644 --- a/data-exploration/drug-response/drug-response-background-trait.ipynb +++ b/data-exploration/drug-response/drug-response-background-trait.ipynb @@ -3087,4 +3087,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/tests/clinvar_xml_io/test_clinvar_record.py b/tests/clinvar_xml_io/test_clinvar_record.py index a727560d..282dfda7 100644 --- a/tests/clinvar_xml_io/test_clinvar_record.py +++ b/tests/clinvar_xml_io/test_clinvar_record.py @@ -35,6 +35,8 @@ def test_multiple_clinical_classifications_record(): class TestClinvarRecord: + """Tests base class as well as reference record""" + @classmethod def setup_class(cls): input_file = os.path.join(resources_dir, 'clinvar_dataset_v2.xml.gz')