Skip to content

Commit

Permalink
filtering - WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
apriltuesday committed Sep 4, 2024
1 parent 86cc604 commit b014d60
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 3 deletions.
7 changes: 6 additions & 1 deletion cmat/clinvar_xml_io/clinvar_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
from datetime import date

from cmat.clinvar_xml_io.clinvar_reference_record import ClinVarReferenceRecord
from cmat.clinvar_xml_io.xml_parsing import iterate_rcv_from_xml, parse_header_attributes
from cmat.clinvar_xml_io.clinvar_set import ClinVarSet
from cmat.clinvar_xml_io.xml_parsing import iterate_rcv_from_xml, parse_header_attributes, iterate_cvs_from_xml

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
Expand All @@ -22,6 +23,10 @@ def __iter__(self):
for rcv in iterate_rcv_from_xml(self.clinvar_xml):
yield ClinVarReferenceRecord(rcv, self.xsd_version)

def iter_cvs(self):
    """Lazily yield one ClinVarSet per element produced by iterate_cvs_from_xml for self.clinvar_xml."""
    # Each element is wrapped together with the dataset's xsd_version, one at a time as the caller iterates.
    yield from (
        ClinVarSet(cvs_element, self.xsd_version)
        for cvs_element in iterate_cvs_from_xml(self.clinvar_xml)
    )

def get_xsd_version(self):
# For format, see https://github.com/ncbi/clinvar/blob/master/FTPSiteXsdChanges.md
if 'xsi:noNamespaceSchemaLocation' in self.header_attr:
Expand Down
1 change: 1 addition & 0 deletions cmat/output_generation/clinvar_to_evidence_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings

logger.info('Processing ClinVar records')
i = -1
# TODO filter here
for clinvar_record in ClinVarDataset(clinvar_xml):
# If start & end provided, only process records in the range [start, end)
i += 1
Expand Down
19 changes: 17 additions & 2 deletions cmat/trait_mapping/trait_names_parsing.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from collections import Counter
from typing import Iterable

from cmat import clinvar_xml_io
from cmat.clinvar_xml_io import ClinVarDataset
from cmat.clinvar_xml_io.clinvar_set import ClinVarSet
from cmat.trait_mapping.trait import Trait


Expand All @@ -27,7 +29,12 @@ def parse_trait_names(filepath: str) -> list:
# Their curation is of highest importance regardless of how many records they are actually associated with.
nt_expansion_traits = set()

for clinvar_record in clinvar_xml_io.ClinVarDataset(filepath):
dataset = ClinVarDataset(filepath)
for clinvar_set in dataset.iter_cvs():
# TODO where to put this logic (both the method & the exclusion list)?
if should_exclude_record(clinvar_set, ['SUB14299258']):
continue
clinvar_record = clinvar_set.rcv
trait_names_and_ids = set((trait.preferred_or_other_valid_name.lower(), trait.identifier)
for trait in clinvar_record.traits_with_valid_names)
for trait_tuple in trait_names_and_ids:
Expand All @@ -46,3 +53,11 @@ def parse_trait_names(filepath: str) -> list:
associated_with_nt_expansion=associated_with_nt_expansion))

return traits


def should_exclude_record(clinvar_set: ClinVarSet, names_to_exclude: Iterable) -> bool:
    """Return True if every submitted record in the set has submission_name in the exclusion list.

    :param clinvar_set: set whose ``scvs`` (submitted records) are inspected.
    :param names_to_exclude: submission names to exclude. Any iterable is accepted, including a
        one-shot iterator or generator; a bare string is treated as a single name.
    :return: True if no submitted record has a submission name outside the exclusion list. This is
        vacuously True when the set has no submitted records.
    """
    # Materialise the names once. Testing membership directly against a one-shot iterator would
    # consume it on the first check and give wrong answers for later records, and a set also
    # avoids a linear scan per submitted record.
    if isinstance(names_to_exclude, str):
        excluded_names = frozenset((names_to_exclude,))
    else:
        excluded_names = frozenset(names_to_exclude)
    return all(scv.submission_name in excluded_names for scv in clinvar_set.scvs)

0 comments on commit b014d60

Please sign in to comment.