Skip to content

Commit

Permalink
fix: deal with undefined XRef (#288)
Browse files Browse the repository at this point in the history
  • Loading branch information
tedil authored Feb 28, 2025
1 parent 67eade9 commit 9745f36
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 28 deletions.
6 changes: 3 additions & 3 deletions clinvar_data/conversion/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
from clinvar_data.conversion.normalizer import VariationArchiveNormalizer
from clinvar_data.pbs import clinvar_public

#: Total number of VariantArchive records in ClinVar on 2024-05-24: 2966486.
TOTAL_RECORDS: int = 3_000_000
#: Total number of VariationArchive records in ClinVar on 2025-02-18: 3335626.
TOTAL_RECORDS: int = 3_335_626


def convert_variation_archive(json_va: dict) -> clinvar_public.VariationArchive:
Expand Down Expand Up @@ -51,7 +51,7 @@ def convert(
pb: tqdm.tqdm | None = None
if show_progress:
pb = tqdm.tqdm(
desc="parsing", unit=" VariationArchive records", smoothing=1.0, total=TOTAL_RECORDS
desc="parsing", unit=" VariationArchive records", smoothing=0.001, total=TOTAL_RECORDS
)
records_written = 0
errors = 0
Expand Down
54 changes: 29 additions & 25 deletions clinvar_data/conversion/dict_to_pb.py
Original file line number Diff line number Diff line change
Expand Up @@ -856,10 +856,12 @@ def parse_citations_xrefs_comments(cls, value: dict[str, Any]) -> CitationsXrefs
xrefs: list[Xref] | None = None
if "XRef" not in value:
pass
elif isinstance(value["XRef"], list):
xrefs = [ConvertXref.xmldict_data_to_pb({"XRef": entry}) for entry in value["XRef"]]
elif isinstance(value["XRef"], dict):
xrefs = [ConvertXref.xmldict_data_to_pb({"XRef": value["XRef"]})]
elif isinstance(value.get("XRef", []), list):
xrefs = [
ConvertXref.xmldict_data_to_pb({"XRef": entry}) for entry in value.get("XRef", [])
]
elif isinstance(value.get("XRef", {}), dict):
xrefs = [ConvertXref.xmldict_data_to_pb({"XRef": value.get("XRef", {})})]
else:
assert False, f"Invalid type for XRef {value['XRef']}"
# parse out comments
Expand Down Expand Up @@ -948,8 +950,8 @@ def xmldict_data_to_pb(cls, value: dict[str, Any]) -> Xref:
The ``Xref`` protobuf.
"""
assert "XRef" in value
assert isinstance(value["XRef"], dict)
tag_xref: dict[str, str] = value["XRef"]
assert isinstance(value.get("XRef", {}), dict)
tag_xref: dict[str, str] = value.get("XRef", {})
cls.assert_keys(tag_xref, ["@ID", "@DB"])
if "@Status" in tag_xref:
status = ConvertStatus.xmldict_data_to_pb(tag_xref["@Status"])
Expand Down Expand Up @@ -2417,7 +2419,7 @@ def xmldict_data_to_pb(cls, tag: dict[str, Any]) -> GeneralCitations:
if "XRef" in tag_function_consequence:
xrefs = [
ConvertXref.xmldict_data_to_pb({"XRef": entry})
for entry in cls.ensure_list(tag_function_consequence["XRef"])
for entry in cls.ensure_list(tag_function_consequence.get("XRef", []))
]

return GeneralCitations(
Expand Down Expand Up @@ -2747,7 +2749,7 @@ def xmldict_data_to_pb(cls, value: dict[str, Any]) -> Location:
if "XRef" in tag_location:
xrefs = [
ConvertXref.xmldict_data_to_pb({"XRef": entry})
for entry in cls.ensure_list(tag_location["XRef"])
for entry in cls.ensure_list(tag_location.get("XRef", []))
]

return Location(
Expand Down Expand Up @@ -3324,7 +3326,7 @@ def xmldict_data_to_pb(cls, tag: dict[str, Any]) -> Method:
if "XRef" in tag_method:
xrefs = [
ConvertXref.xmldict_data_to_pb({"XRef": entry})
for entry in cls.ensure_list(tag_method["XRef"])
for entry in cls.ensure_list(tag_method.get("XRef", []))
]
description: str | None = None
if "Description" in tag_method:
Expand Down Expand Up @@ -3400,7 +3402,7 @@ def convert_gene(cls, tag: dict[str, Any]) -> AlleleScv.Gene:
if "XRef" in tag_gene:
xrefs = [
ConvertXref.xmldict_data_to_pb({"XRef": entry})
for entry in cls.ensure_list(tag_gene["XRef"])
for entry in cls.ensure_list(tag_gene.get("XRef", []))
]
symbol: str | None = None
if "@Symbol" in tag_gene:
Expand Down Expand Up @@ -3509,15 +3511,17 @@ def xmldict_data_to_pb(cls, value: dict[str, Any]) -> AlleleScv: # noqa: C901
assert False, f"Invalid type for Citation {tag_sa['Citation']}"
# parse out xrefs
xrefs: list[Xref] | None = None
if "XRefList" not in tag_sa or "XRef" not in tag_sa["XRefList"]:
if "XRefList" not in tag_sa or "XRef" not in (tag_sa["XRefList"] or {}):
pass
elif isinstance(tag_sa["XRefList"]["XRef"], list):
elif isinstance((tag_sa["XRefList"] or {}).get("XRef", []), list):
xrefs = [
ConvertXref.xmldict_data_to_pb({"XRef": entry})
for entry in tag_sa["XRefList"]["XRef"]
for entry in (tag_sa["XRefList"] or {}).get("XRef", [])
]
elif isinstance((tag_sa["XRefList"] or {}).get("XRef", {}), dict):
xrefs = [
ConvertXref.xmldict_data_to_pb({"XRef": (tag_sa["XRefList"] or {}).get("XRef", {})})
]
elif isinstance(tag_sa["XRefList"]["XRef"], dict):
xrefs = [ConvertXref.xmldict_data_to_pb({"XRef": tag_sa["XRefList"]["XRef"]})]
else:
assert False, f"Invalid type for XRef {tag_sa['XRef']}"
# parse out comments
Expand Down Expand Up @@ -3628,10 +3632,10 @@ def xmldict_data_to_pb(cls, tag: dict[str, Any]) -> HaplotypeScv:
for entry in cls.ensure_list(tag_genotype["CitationList"]["Citation"])
]
xrefs: list[Xref] | None = None
if "XRefList" in tag_genotype and "XRef" in tag_genotype["XRefList"]:
if "XRefList" in tag_genotype and "XRef" in (tag_genotype["XRefList"] or {}):
xrefs = [
ConvertXref.xmldict_data_to_pb({"XRef": entry})
for entry in cls.ensure_list(tag_genotype["XRefList"]["XRef"])
for entry in cls.ensure_list((tag_genotype["XRefList"] or {}).get("XRef", []))
]
comments: list[Comment] | None = None
if "Comment" in tag_genotype:
Expand Down Expand Up @@ -3722,10 +3726,10 @@ def xmldict_data_to_pb(cls, tag: dict[str, Any]) -> GenotypeScv:
for entry in cls.ensure_list(tag_genotype["CitationList"]["Citation"])
]
xrefs: list[Xref] | None = None
if "XRefList" in tag_genotype and "XRef" in tag_genotype["XRefList"]:
if "XRefList" in tag_genotype and "XRef" in (tag_genotype["XRefList"] or {}):
xrefs = [
ConvertXref.xmldict_data_to_pb({"XRef": entry})
for entry in cls.ensure_list(tag_genotype["XRefList"]["XRef"])
for entry in cls.ensure_list((tag_genotype["XRefList"] or {}).get("XRef", []))
]
comments: list[Comment] | None = None
if "Comment" in tag_genotype:
Expand Down Expand Up @@ -4395,10 +4399,10 @@ def xmldict_data_to_pb(cls, tag: dict[str, Any]) -> Allele:
{"Classifications": tag_allele["Classifications"]}
)
xrefs: list[Xref] | None = None
if "XRefList" in tag_allele and "XRef" in tag_allele["XRefList"]:
if "XRefList" in tag_allele and "XRef" in (tag_allele["XRefList"] or {}):
xrefs = [
ConvertXref.xmldict_data_to_pb({"XRef": entry})
for entry in cls.ensure_list(tag_allele["XRefList"]["XRef"])
for entry in cls.ensure_list((tag_allele["XRefList"] or {}).get("XRef", []))
]
comments: list[Comment] | None = None
if "Comment" in tag_allele:
Expand Down Expand Up @@ -4496,10 +4500,10 @@ def xmldict_data_to_pb(cls, tag: dict[str, Any]) -> Haplotype:
for entry in cls.ensure_list(tag_haplotype["FunctionalConsequence"])
]
xrefs: list[Xref] | None = None
if "XRefList" in tag_haplotype and "XRef" in tag_haplotype["XRefList"]:
if "XRefList" in tag_haplotype and "XRef" in (tag_haplotype["XRefList"] or {}):
xrefs = [
ConvertXref.xmldict_data_to_pb({"XRef": entry})
for entry in cls.ensure_list(tag_haplotype["XRefList"]["XRef"])
for entry in cls.ensure_list((tag_haplotype["XRefList"] or {}).get("XRef", []))
]
comments: list[Comment] | None = None
if "Comment" in tag_haplotype:
Expand Down Expand Up @@ -4668,10 +4672,10 @@ def xmldict_data_to_pb(cls, tag: dict[str, Any]) -> Genotype: # noqa: C901
{"Classifications": tag_record["Classifications"]}
)
xrefs: list[Xref] | None = None
if "XRefList" in tag_record and "XRef" in tag_record["XRefList"]:
if "XRefList" in tag_record and "XRef" in (tag_record["XRefList"] or {}):
xrefs = [
ConvertXref.xmldict_data_to_pb({"XRef": entry})
for entry in cls.ensure_list(tag_record["XRefList"]["XRef"])
for entry in cls.ensure_list((tag_record["XRefList"] or {}).get("XRef", []))
]
citations: list[Citation] | None = None
if "CitationList" in tag_record and "Citation" in tag_record["CitationList"]:
Expand Down

0 comments on commit 9745f36

Please sign in to comment.