Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat!: move ViccNormalizerDataExtension to mappings #425

Merged
merged 40 commits into from
Feb 14, 2025
Merged
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
cc19f94
wip: initial messy work
korikuzma Dec 23, 2024
d0a26d6
wip: update normalizers
korikuzma Jan 6, 2025
2262bc5
build: bump variation normalizer
korikuzma Jan 7, 2025
9ec5165
wip: add subptype to civic method
korikuzma Jan 8, 2025
0ebb8a6
wip: evidence level to vicc concept should include mappings
korikuzma Jan 9, 2025
42b4e6c
wip: messy initial work for supporting civic assertions
korikuzma Jan 9, 2025
e1948ed
wip
korikuzma Jan 21, 2025
b477340
wip
korikuzma Jan 21, 2025
14a8cf5
revert strength (save for #419)
korikuzma Jan 21, 2025
9390433
revert mane_genes (not needed anymore for varcat)
korikuzma Jan 21, 2025
441283a
fix tests
korikuzma Jan 23, 2025
2d88fb6
bump cat_vrs
korikuzma Jan 23, 2025
d7420ae
missed renames
korikuzma Jan 23, 2025
2553f46
update ga4gh va spec imports
korikuzma Jan 23, 2025
4633157
update coding
korikuzma Jan 24, 2025
6a0ae3a
use va spec gh
korikuzma Jan 24, 2025
c0a6cc4
Merge branch 'issue-415' into issue-240
korikuzma Jan 24, 2025
65dc2a3
style: resolve ruff errors
korikuzma Jan 24, 2025
75931bb
Merge branch 'resolve-ruff' into issue-415
korikuzma Jan 24, 2025
3fdcc58
Merge branch 'issue-415' into issue-240
korikuzma Jan 24, 2025
f81c98c
discard null variant
korikuzma Jan 24, 2025
1d1c2bb
fix ci
korikuzma Jan 24, 2025
62b564a
Merge branch 'issue-415' into issue-240
korikuzma Jan 24, 2025
9e1596c
strength is mappable concept, not coding
korikuzma Jan 24, 2025
3c1e1f5
Merge branch 'issue-415' into issue-240
korikuzma Jan 24, 2025
07c2bce
rm coding prefix
korikuzma Jan 24, 2025
e761a2f
Merge branch 'issue-415' into issue-240
korikuzma Jan 24, 2025
7c0db36
wip: update db
korikuzma Jan 25, 2025
e17272a
wip: update query
korikuzma Jan 26, 2025
00d369f
fix transformer tests
korikuzma Jan 26, 2025
65804f7
Merge branch 'issue-240' into issue-419
korikuzma Jan 27, 2025
face8b8
update
korikuzma Jan 27, 2025
38d46e1
revert
korikuzma Jan 27, 2025
77abcab
cleanup
korikuzma Jan 27, 2025
df10969
`ViccNormalizerDataExtension` should be moved to `mappings`
korikuzma Jan 27, 2025
704602a
update tests
korikuzma Jan 27, 2025
53770c7
cleanup
korikuzma Jan 27, 2025
0d3113a
updates needed for varcat
korikuzma Feb 5, 2025
a012460
docs
korikuzma Feb 5, 2025
8d4cb8e
Merge branch 'staging' into issue-418
korikuzma Feb 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 25 additions & 19 deletions src/metakb/load_data.py
Original file line number Diff line number Diff line change
@@ -8,8 +8,7 @@
from neo4j import Driver, ManagedTransaction

from metakb.database import get_driver
from metakb.normalizers import VICC_NORMALIZER_DATA, ViccDiseaseNormalizerData
from metakb.transformers.base import TherapyType
from metakb.transformers.base import NORMALIZER_PRIORITY_EXT_NAME, TherapyType

_logger = logging.getLogger(__name__)

@@ -33,34 +32,41 @@ def _create_parameterized_query(
def _add_mappings_and_exts_to_obj(obj: dict, obj_keys: list[str]) -> None:
"""Get mappings and extensions from object and add to `obj` and `obj_keys`

:param obj: Object to update with mappings and extensions (if found)
:param obj: Object to update with mappings and extensions (if found).
If ``obj`` has Disease, Gene, or Therapy ``conceptType``, then ``normalizer_id``
will also be added.
:param obj_keys: Parameterized queries. This will be mutated if mappings or
extensions exist
"""
mappings = obj.get("mappings", [])
if mappings:
concept_type = obj.get("conceptType")
if concept_type in {"Disease", "Gene", "Therapy"}:
normalizer_id = None
for mapping in obj["mappings"]:
extensions = mapping.get("extensions") or []
for ext in extensions:
if ext["name"] == NORMALIZER_PRIORITY_EXT_NAME and ext["value"]:
normalizer_id = mapping["coding"]["code"]
obj["normalizer_id"] = normalizer_id
obj_keys.append("normalizer_id:$normalizer_id")
break

if normalizer_id:
break

obj["mappings"] = json.dumps(mappings)
obj_keys.append("mappings:$mappings")

extensions = obj.get("extensions", [])
for ext in extensions:
if ext["name"] == VICC_NORMALIZER_DATA:
for normalized_field in ViccDiseaseNormalizerData.model_fields:
normalized_val = ext["value"].get(normalized_field)
if normalized_val is None:
continue

name = f"normalizer_{normalized_field}"
obj[name] = normalized_val
obj_keys.append(f"{name}:${name}")
name = "_".join(ext["name"].split()).lower()
val = ext["value"]
if isinstance(val, (dict | list)):
obj[name] = json.dumps(val)
else:
name = "_".join(ext["name"].split()).lower()
val = ext["value"]
if isinstance(val, (dict | list)):
obj[name] = json.dumps(val)
else:
obj[name] = val
obj_keys.append(f"{name}:${name}")
obj[name] = val
obj_keys.append(f"{name}:${name}")


def _add_method(tx: ManagedTransaction, method: dict, ids_in_stmts: set[str]) -> None:
24 changes: 0 additions & 24 deletions src/metakb/normalizers.py
Original file line number Diff line number Diff line change
@@ -4,7 +4,6 @@
import os
from collections.abc import Iterable
from enum import Enum
from typing import Literal

from botocore.exceptions import TokenRetrievalError
from disease.cli import update_db as update_disease_db
@@ -23,7 +22,6 @@
from gene.database.database import AWS_ENV_VAR_NAME as GENE_AWS_ENV_VAR_NAME
from gene.query import QueryHandler as GeneQueryHandler
from gene.schemas import NormalizeService as NormalizedGene
from pydantic import BaseModel
from therapy.cli import update_normalizer_db as update_therapy_db
from therapy.database import create_db as create_therapy_db
from therapy.database.database import AWS_ENV_VAR_NAME as THERAPY_AWS_ENV_VAR_NAME
@@ -44,28 +42,6 @@
_logger = logging.getLogger(__name__)


class ViccNormalizerData(BaseModel, extra="forbid"):
"""Define model for representing VICC normalizer data"""

id: str
label: str


class ViccDiseaseNormalizerData(ViccNormalizerData, extra="forbid"):
"""Define model for representing VICC disease normalizer data"""

mondo_id: str | None = None


VICC_NORMALIZER_DATA = "vicc_normalizer_data"


class ViccNormalizerDataExtension(Extension):
"""Define model for representing VICC normalizer data as an Extension"""

name: Literal["vicc_normalizer_data"] = VICC_NORMALIZER_DATA


class ViccNormalizers:
"""Manage VICC concept normalization services.

40 changes: 11 additions & 29 deletions src/metakb/query.py
Original file line number Diff line number Diff line change
@@ -20,9 +20,6 @@

from metakb.database import get_driver
from metakb.normalizers import (
ViccDiseaseNormalizerData,
ViccNormalizerData,
ViccNormalizerDataExtension,
ViccNormalizers,
)
from metakb.schemas.api import (
@@ -577,41 +574,23 @@ def _get_nested_stmt(

return PROP_TYPE_TO_CLASS[prop_type](**params)

@staticmethod
def _get_vicc_normalizer_extension(node: dict) -> ViccNormalizerDataExtension:
"""Get VICC Normalizer extension data

:param node: Therapy, disease, or gene node data
:return: VICC Normalizer extension data
"""
params = {
"id": node["normalizer_id"],
"label": node["normalizer_label"],
}

if node["conceptType"] == "Disease":
params["mondo_id"] = node.get("normalizer_mondo_id")
ext_val = ViccDiseaseNormalizerData(**params)
else:
ext_val = ViccNormalizerData(**params)

return ViccNormalizerDataExtension(value=ext_val.model_dump())

def _get_disease(self, node: dict) -> MappableConcept:
"""Get disease data from a node with relationship ``HAS_TUMOR_TYPE``

:param node: Disease node data
:return: Disease mappable concept object
"""
node["mappings"] = _deserialize_field(node, "mappings")
extensions = [self._get_vicc_normalizer_extension(node)]
extensions = []
descr = node.get("description")
if descr:
extensions.append(Extension(name="description", value=descr))
aliases = node.get("aliases")
if aliases:
extensions.append(Extension(name="aliases", value=json.loads(aliases)))
node["extensions"] = extensions

if extensions:
node["extensions"] = extensions
return MappableConcept(**node)

def _get_variations(self, cv_id: str, relation: VariationRelation) -> list[dict]:
@@ -732,15 +711,16 @@ def _get_gene_context_qualifier(self, statement_id: str) -> MappableConcept | No

gene_node = results.records[0].data()["g"]
gene_node["mappings"] = _deserialize_field(gene_node, "mappings")
extensions = [self._get_vicc_normalizer_extension(gene_node)]
extensions = []
descr = gene_node.get("description")
if descr:
extensions.append(Extension(name="description", value=descr))
aliases = gene_node.get("aliases")
if aliases:
extensions.append(Extension(name="aliases", value=json.loads(aliases)))

gene_node["extensions"] = extensions
if extensions:
gene_node["extensions"] = extensions
return MappableConcept(**gene_node)

def _get_method_document(self, method_id: str) -> Document | None:
@@ -896,7 +876,7 @@ def _get_therapy(self, in_ta_params: dict) -> MappableConcept:
"""
ta_params = copy(in_ta_params)
ta_params["mappings"] = _deserialize_field(ta_params, "mappings")
extensions = [self._get_vicc_normalizer_extension(ta_params)]
extensions = []
regulatory_approval = ta_params.get("regulatory_approval")
if regulatory_approval:
regulatory_approval = json.loads(regulatory_approval)
@@ -906,7 +886,9 @@ def _get_therapy(self, in_ta_params: dict) -> MappableConcept:
aliases = ta_params.get("aliases")
if aliases:
extensions.append(Extension(name="aliases", value=json.loads(aliases)))
ta_params["extensions"] = extensions

if extensions:
ta_params["extensions"] = extensions
return MappableConcept(**ta_params)

async def batch_search_statements(
107 changes: 80 additions & 27 deletions src/metakb/transformers/base.py
Original file line number Diff line number Diff line change
@@ -8,9 +8,6 @@
from pathlib import Path
from typing import ClassVar

from disease.schemas import (
SYSTEM_URI_TO_NAMESPACE as DISEASE_SYSTEM_URI_TO_NAMESPACE,
)
from disease.schemas import (
NamespacePrefix as DiseaseNamespacePrefix,
)
@@ -34,16 +31,18 @@
)
from ga4gh.va_spec.base import Document, Method, TherapyGroup
from ga4gh.vrs.models import Allele
from gene.schemas import NormalizeService as NormalizedGene
from gene.schemas import (
NamespacePrefix as GeneNamespacePrefix,
)
from gene.schemas import (
NormalizeService as NormalizedGene,
)
from pydantic import BaseModel, Field, StrictStr, ValidationError
from therapy.schemas import NormalizationService as NormalizedTherapy

from metakb import APP_ROOT, DATE_FMT
from metakb.harvesters.base import _HarvestedData
from metakb.normalizers import (
ViccDiseaseNormalizerData,
ViccNormalizerData,
ViccNormalizerDataExtension,
ViccNormalizers,
)
from metakb.schemas.app import SourceName
@@ -57,6 +56,9 @@
NormalizedGene: "gene",
}

# Normalizer priority extension name
NORMALIZER_PRIORITY_EXT_NAME = "vicc_normalizer_priority"


class EcoLevel(str, Enum):
"""Define constraints for Evidence Ontology levels"""
@@ -112,6 +114,14 @@ class ViccConceptVocab(BaseModel):
definition: StrictStr


class _TransformedRecordsCache(BaseModel):
"""Define model for caching transformed records"""

therapies: ClassVar[dict[str, MappableConcept]] = {}
conditions: ClassVar[dict[str, MappableConcept]] = {}
genes: ClassVar[dict[str, MappableConcept]] = {}


class TransformedData(BaseModel):
"""Define model for transformed data"""

@@ -521,39 +531,82 @@ def _add_therapy(
if therapy:
self.able_to_normalize["therapies"][therapy_id] = therapy
self.processed_data.therapies.append(therapy)
else:
self.unable_to_normalize["therapies"].add(therapy_id)

return therapy

@staticmethod
def _get_vicc_normalizer_extension(
def _get_vicc_normalizer_mappings(
normalized_id: str,
normalizer_resp: NormalizedDisease | NormalizedTherapy | NormalizedGene,
) -> ViccNormalizerDataExtension:
"""Get VICC Normalizer extension data
) -> list[ConceptMapping]:
"""Get VICC Normalizer mappable concept

:param normalized_id: Normalized ID from VICC normalizer
:param normalizer_resp: Response from VICC normalizer
:return: VICC Normalizer extension data
:return: List of VICC Normalizer data represented as concept mappings
"""
attr_name = NORMALIZER_INSTANCE_TO_ATTR[type(normalizer_resp)]
normalizer_resp_obj = getattr(normalizer_resp, attr_name)

params = {"id": normalized_id, "label": normalizer_resp_obj.label}
def _update_mapping(
mapping: ConceptMapping,
normalized_id: str,
normalizer_label: str,
) -> Extension:
"""Update ``mapping`` to include extension on whether ``mapping`` contains
code that matches the merged record's primary identifier.

:param mapping: ConceptMapping from vicc normalizer. This will be mutated.
Extensions will be added. Label will be added if mapping identifier
matches normalized merged identifier.
:param normalized_id: Concept ID from normalized record
:param normalizer_label: Label from normalized record
:return: ConceptMapping with normalizer extension added, as well as label
(if mapping id matches normalized merged id)
"""
is_priority = normalized_id == mapping.coding.code.root

if isinstance(normalizer_resp, NormalizedDisease):
mappings = normalizer_resp_obj.mappings or []
for mapping in mappings:
merged_id_ext = Extension(
name=NORMALIZER_PRIORITY_EXT_NAME, value=is_priority
)
if mapping.extensions:
mapping.extensions.append(merged_id_ext)
else:
mapping.extensions = [merged_id_ext]

if is_priority:
mapping.coding.label = normalizer_label

return mapping

mappings: list[ConceptMapping] = []
attr_name = NORMALIZER_INSTANCE_TO_ATTR[type(normalizer_resp)]
normalizer_resp_obj = getattr(normalizer_resp, attr_name)
normalizer_label = normalizer_resp_obj.label
is_disease = isinstance(normalizer_resp, NormalizedDisease)
is_gene = isinstance(normalizer_resp, NormalizedGene)

normalizer_mappings = normalizer_resp_obj.mappings or []
for mapping in normalizer_mappings:
if normalized_id == mapping.coding.code.root:
mappings.append(
_update_mapping(mapping, normalized_id, normalizer_label)
)
else:
mapping_code_lower = mapping.coding.code.root.lower()
if (
DISEASE_SYSTEM_URI_TO_NAMESPACE.get(mapping.coding.system)
== DiseaseNamespacePrefix.MONDO.value
is_disease
and mapping_code_lower.startswith(
DiseaseNamespacePrefix.MONDO.value
)
) or (
is_gene
and mapping_code_lower.startswith(
(GeneNamespacePrefix.NCBI.value, GeneNamespacePrefix.HGNC.value)
)
):
params["mondo_id"] = mapping.coding.code.root
break
ext_val = ViccDiseaseNormalizerData(**params)
else:
ext_val = ViccNormalizerData(**params)
return ViccNormalizerDataExtension(value=ext_val.model_dump())
mappings.append(
_update_mapping(mapping, normalized_id, normalizer_label)
)
return mappings

def create_json(self, cdm_filepath: Path | None = None) -> None:
"""Create a composite JSON for transformed data.
Loading