From df10969cee8d64e95eeb2d8def525905ec477f6d Mon Sep 17 00:00:00 2001 From: Kori Kuzma Date: Mon, 27 Jan 2025 18:28:10 -0500 Subject: [PATCH 1/5] `ViccNormalizerDataExtension` should be moved to `mappings` --- src/metakb/load_data.py | 51 ++-- src/metakb/normalizers.py | 24 -- src/metakb/query.py | 40 +-- src/metakb/transformers/base.py | 73 +++-- src/metakb/transformers/civic.py | 51 ++-- src/metakb/transformers/moa.py | 28 +- tests/conftest.py | 274 ++++++++++++------ tests/unit/database/test_database.py | 36 +-- tests/unit/search/test_search_statements.py | 17 +- .../test_civic_transformer_diagnostic.py | 79 +++-- .../test_moa_transformer_prognostic.py | 58 ++-- .../test_moa_transformer_therapeutic.py | 53 ++-- 12 files changed, 456 insertions(+), 328 deletions(-) diff --git a/src/metakb/load_data.py b/src/metakb/load_data.py index b8f254a4..658ccf4c 100644 --- a/src/metakb/load_data.py +++ b/src/metakb/load_data.py @@ -8,8 +8,7 @@ from neo4j import Driver, ManagedTransaction from metakb.database import get_driver -from metakb.normalizers import VICC_NORMALIZER_DATA, ViccDiseaseNormalizerData -from metakb.transformers.base import TherapyType +from metakb.transformers.base import NORMALIZER_PRIORITY_EXT_NAME, TherapyType _logger = logging.getLogger(__name__) @@ -44,23 +43,13 @@ def _add_mappings_and_exts_to_obj(obj: dict, obj_keys: list[str]) -> None: extensions = obj.get("extensions", []) for ext in extensions: - if ext["name"] == VICC_NORMALIZER_DATA: - for normalized_field in ViccDiseaseNormalizerData.model_fields: - normalized_val = ext["value"].get(normalized_field) - if normalized_val is None: - continue - - name = f"normalizer_{normalized_field}" - obj[name] = normalized_val - obj_keys.append(f"{name}:${name}") + name = "_".join(ext["name"].split()).lower() + val = ext["value"] + if isinstance(val, (dict | list)): + obj[name] = json.dumps(val) else: - name = "_".join(ext["name"].split()).lower() - val = ext["value"] - if isinstance(val, (dict | list)): - 
obj[name] = json.dumps(val) - else: - obj[name] = val - obj_keys.append(f"{name}:${name}") + obj[name] = val + obj_keys.append(f"{name}:${name}") def _add_method(tx: ManagedTransaction, method: dict, ids_in_stmts: set[str]) -> None: @@ -99,6 +88,26 @@ def _add_method(tx: ManagedTransaction, method: dict, ids_in_stmts: set[str]) -> tx.run(query, **method) +def _add_normalizer_id_to_obj(obj: dict, obj_keys: list[str]) -> None: + """Get normalizer ID and add to ``obj`` and ``obj_keys`` + + :param obj: Object to update with ``normalizer_id`` + :param obj_keys: Parameterized queries. This will be mutated. + """ + # Use the first mapping flagged as the normalizer's priority concept + priority_codes = ( + mapping["coding"]["code"] + for mapping in obj.get("mappings", []) + for ext in mapping.get("extensions") or [] + if ext["name"] == NORMALIZER_PRIORITY_EXT_NAME and ext["value"] + ) + normalizer_id = next(priority_codes, None) + + if normalizer_id: + obj["normalizer_id"] = normalizer_id + obj_keys.append("normalizer_id:$normalizer_id") + + def _add_gene_or_disease( tx: ManagedTransaction, obj_in: dict, ids_in_stmts: set[str] ) -> None: @@ -122,6 +131,9 @@ def _add_gene_or_disease( obj["conceptType"] = obj_type obj_keys = [_create_parameterized_query(obj, ("id", "label", "conceptType"))] + _add_normalizer_id_to_obj( + obj, obj_keys + ) # must be before _add_mappings_and_exts_to_obj _add_mappings_and_exts_to_obj(obj, obj_keys) obj_keys = ", ".join(obj_keys) @@ -192,6 +204,9 @@ def _add_therapy(tx: ManagedTransaction, therapy_in: dict) -> None: _create_parameterized_query(therapy, ("id", "label", "conceptType")) ] + _add_normalizer_id_to_obj( + therapy, nonnull_keys + ) # must be before _add_mappings_and_exts_to_obj _add_mappings_and_exts_to_obj(therapy, nonnull_keys) nonnull_keys = ", ".join(nonnull_keys) diff --git a/src/metakb/normalizers.py b/src/metakb/normalizers.py index d99e3a70..7a69fe7e 100644 --- a/src/metakb/normalizers.py +++ b/src/metakb/normalizers.py @@ -4,7 +4,6 @@ import os from collections.abc import Iterable from 
enum import Enum -from typing import Literal from botocore.exceptions import TokenRetrievalError from disease.cli import update_db as update_disease_db @@ -23,7 +22,6 @@ from gene.database.database import AWS_ENV_VAR_NAME as GENE_AWS_ENV_VAR_NAME from gene.query import QueryHandler as GeneQueryHandler from gene.schemas import NormalizeService as NormalizedGene -from pydantic import BaseModel from therapy.cli import update_normalizer_db as update_therapy_db from therapy.database import create_db as create_therapy_db from therapy.database.database import AWS_ENV_VAR_NAME as THERAPY_AWS_ENV_VAR_NAME @@ -44,28 +42,6 @@ _logger = logging.getLogger(__name__) -class ViccNormalizerData(BaseModel, extra="forbid"): - """Define model for representing VICC normalizer data""" - - id: str - label: str - - -class ViccDiseaseNormalizerData(ViccNormalizerData, extra="forbid"): - """Define model for representing VICC disease normalizer data""" - - mondo_id: str | None = None - - -VICC_NORMALIZER_DATA = "vicc_normalizer_data" - - -class ViccNormalizerDataExtension(Extension): - """Define model for representing VICC normalizer data as an Extension""" - - name: Literal["vicc_normalizer_data"] = VICC_NORMALIZER_DATA - - class ViccNormalizers: """Manage VICC concept normalization services. 
diff --git a/src/metakb/query.py b/src/metakb/query.py index 81abf89c..d1b5d468 100644 --- a/src/metakb/query.py +++ b/src/metakb/query.py @@ -20,9 +20,6 @@ from metakb.database import get_driver from metakb.normalizers import ( - ViccDiseaseNormalizerData, - ViccNormalizerData, - ViccNormalizerDataExtension, ViccNormalizers, ) from metakb.schemas.api import ( @@ -577,26 +574,6 @@ def _get_nested_stmt( return PROP_TYPE_TO_CLASS[prop_type](**params) - @staticmethod - def _get_vicc_normalizer_extension(node: dict) -> ViccNormalizerDataExtension: - """Get VICC Normalizer extension data - - :param node: Therapy, disease, or gene node data - :return: VICC Normalizer extension data - """ - params = { - "id": node["normalizer_id"], - "label": node["normalizer_label"], - } - - if node["conceptType"] == "Disease": - params["mondo_id"] = node.get("normalizer_mondo_id") - ext_val = ViccDiseaseNormalizerData(**params) - else: - ext_val = ViccNormalizerData(**params) - - return ViccNormalizerDataExtension(value=ext_val.model_dump()) - def _get_disease(self, node: dict) -> MappableConcept: """Get disease data from a node with relationship ``HAS_TUMOR_TYPE`` @@ -604,14 +581,16 @@ def _get_disease(self, node: dict) -> MappableConcept: :return: Disease mappable concept object """ node["mappings"] = _deserialize_field(node, "mappings") - extensions = [self._get_vicc_normalizer_extension(node)] + extensions = [] descr = node.get("description") if descr: extensions.append(Extension(name="description", value=descr)) aliases = node.get("aliases") if aliases: extensions.append(Extension(name="aliases", value=json.loads(aliases))) - node["extensions"] = extensions + + if extensions: + node["extensions"] = extensions return MappableConcept(**node) def _get_variations(self, cv_id: str, relation: VariationRelation) -> list[dict]: @@ -732,7 +711,7 @@ def _get_gene_context_qualifier(self, statement_id: str) -> MappableConcept | No gene_node = results.records[0].data()["g"] 
gene_node["mappings"] = _deserialize_field(gene_node, "mappings") - extensions = [self._get_vicc_normalizer_extension(gene_node)] + extensions = [] descr = gene_node.get("description") if descr: extensions.append(Extension(name="description", value=descr)) @@ -740,7 +719,8 @@ def _get_gene_context_qualifier(self, statement_id: str) -> MappableConcept | No if aliases: extensions.append(Extension(name="aliases", value=json.loads(aliases))) - gene_node["extensions"] = extensions + if extensions: + gene_node["extensions"] = extensions return MappableConcept(**gene_node) def _get_method_document(self, method_id: str) -> Document | None: @@ -896,7 +876,7 @@ def _get_therapy(self, in_ta_params: dict) -> MappableConcept: """ ta_params = copy(in_ta_params) ta_params["mappings"] = _deserialize_field(ta_params, "mappings") - extensions = [self._get_vicc_normalizer_extension(ta_params)] + extensions = [] regulatory_approval = ta_params.get("regulatory_approval") if regulatory_approval: regulatory_approval = json.loads(regulatory_approval) @@ -906,7 +886,9 @@ def _get_therapy(self, in_ta_params: dict) -> MappableConcept: aliases = ta_params.get("aliases") if aliases: extensions.append(Extension(name="aliases", value=json.loads(aliases))) - ta_params["extensions"] = extensions + + if extensions: + ta_params["extensions"] = extensions return MappableConcept(**ta_params) async def batch_search_statements( diff --git a/src/metakb/transformers/base.py b/src/metakb/transformers/base.py index e5d9ff55..f49a8802 100644 --- a/src/metakb/transformers/base.py +++ b/src/metakb/transformers/base.py @@ -41,9 +41,6 @@ from metakb import APP_ROOT, DATE_FMT from metakb.harvesters.base import _HarvestedData from metakb.normalizers import ( - ViccDiseaseNormalizerData, - ViccNormalizerData, - ViccNormalizerDataExtension, ViccNormalizers, ) from metakb.schemas.app import SourceName @@ -57,6 +54,9 @@ NormalizedGene: "gene", } +# Normalizer priority extension name +NORMALIZER_PRIORITY_EXT_NAME = 
"vicc_normalizer_priority" + class EcoLevel(str, Enum): """Define constraints for Evidence Ontology levels""" @@ -526,34 +526,73 @@ def _add_therapy( return therapy @staticmethod - def _get_vicc_normalizer_extension( + def _get_vicc_normalizer_mappings( normalized_id: str, normalizer_resp: NormalizedDisease | NormalizedTherapy | NormalizedGene, - ) -> ViccNormalizerDataExtension: - """Get VICC Normalizer extension data + ) -> list[ConceptMapping]: + """Get VICC Normalizer mappable concept :param normalized_id: Normalized ID from VICC normalizer :param normalizer_resp: Response from VICC normalizer - :return: VICC Normalizer extension data + :return: List of VICC Normalizer data represented as mappable concept """ - attr_name = NORMALIZER_INSTANCE_TO_ATTR[type(normalizer_resp)] - normalizer_resp_obj = getattr(normalizer_resp, attr_name) - params = {"id": normalized_id, "label": normalizer_resp_obj.label} + def _add_merged_id_ext( + mapping: ConceptMapping, + is_priority: bool, + label: str | None = None, + ) -> Extension: + """Update ``mapping`` to include extension on whether mapping is from merged identifier + + :param mapping: ConceptMapping from vicc normalizer. This will be mutated. + :param is_priority: ``True`` if concept mapping contains primaryCode that + matches merged record primaryCode. 
``False`` otherwise (meaning it comes + from merged record mappings) + :param label: Merged concept label, if found + :return: ConceptMapping with normalizer extension added + """ + merged_id_ext = Extension( + name=NORMALIZER_PRIORITY_EXT_NAME, value=is_priority + ) + if mapping.extensions: + mapping.extensions.append(merged_id_ext) + else: + mapping.extensions = [merged_id_ext] + + if label: + mapping.coding.label = label + return mapping + + mappings: list[ConceptMapping] = [] + attr_name = NORMALIZER_INSTANCE_TO_ATTR[type(normalizer_resp)] + normalizer_resp_obj = getattr(normalizer_resp, attr_name) + normalizer_mappings = normalizer_resp_obj.mappings or [] if isinstance(normalizer_resp, NormalizedDisease): - mappings = normalizer_resp_obj.mappings or [] - for mapping in mappings: + for mapping in normalizer_mappings: if ( DISEASE_SYSTEM_URI_TO_NAMESPACE.get(mapping.coding.system) == DiseaseNamespacePrefix.MONDO.value ): - params["mondo_id"] = mapping.coding.code.root - break - ext_val = ViccDiseaseNormalizerData(**params) + mappings.append(_add_merged_id_ext(mapping, is_priority=False)) + else: + if normalized_id == mapping.coding.code.root: + mappings.append( + _add_merged_id_ext( + mapping, + label=normalizer_resp_obj.label, + is_priority=True, + ) + ) else: - ext_val = ViccNormalizerData(**params) - return ViccNormalizerDataExtension(value=ext_val.model_dump()) + mappings.extend( + _add_merged_id_ext( + mapping, label=normalizer_resp_obj.label, is_priority=True + ) + for mapping in normalizer_mappings + if normalized_id == mapping.coding.code.root + ) + return mappings def create_json(self, cdm_filepath: Path | None = None) -> None: """Create a composite JSON for transformed data. 
diff --git a/src/metakb/transformers/civic.py b/src/metakb/transformers/civic.py index 9fa7c8df..a8c699e8 100644 --- a/src/metakb/transformers/civic.py +++ b/src/metakb/transformers/civic.py @@ -797,10 +797,19 @@ def _add_genes(self, genes: list[dict]) -> None: queries ) - extensions = [ - self._get_vicc_normalizer_extension(normalized_gene_id, gene_norm_resp) + mappings = [ + ConceptMapping( + coding=Coding( + id=ncbigene, + code=str(gene["entrez_id"]), + system="https://www.ncbi.nlm.nih.gov/gene/", + ), + relation=Relation.EXACT_MATCH, + ), + *self._get_vicc_normalizer_mappings(normalized_gene_id, gene_norm_resp), ] + extensions = [] if gene["aliases"]: extensions.append(Extension(name="aliases", value=gene["aliases"])) @@ -814,17 +823,8 @@ def _add_genes(self, genes: list[dict]) -> None: id=gene_id, conceptType="Gene", label=gene["name"], - mappings=[ - ConceptMapping( - coding=Coding( - id=ncbigene, - code=str(gene["entrez_id"]), - system="https://www.ncbi.nlm.nih.gov/gene/", - ), - relation=Relation.EXACT_MATCH, - ) - ], - extensions=extensions, + mappings=mappings, + extensions=extensions or None, ) self.able_to_normalize["genes"][gene_id] = civic_gene self.processed_data.genes.append(civic_gene) @@ -900,16 +900,11 @@ def _get_disease(self, disease: dict) -> MappableConcept | None: ) return None + mappings.extend( + self._get_vicc_normalizer_mappings(normalized_disease_id, disease_norm_resp) + ) return MappableConcept( - id=disease_id, - conceptType="Disease", - label=display_name, - mappings=mappings if mappings else None, - extensions=[ - self._get_vicc_normalizer_extension( - normalized_disease_id, disease_norm_resp - ) - ], + id=disease_id, conceptType="Disease", label=display_name, mappings=mappings ) def _get_therapeutic_substitute_group( @@ -1006,13 +1001,13 @@ def _get_therapy(self, therapy: dict) -> MappableConcept | None: regulatory_approval_extension = ( self.vicc_normalizers.get_regulatory_approval_extension(therapy_norm_resp) ) - - extensions = [ 
- self._get_vicc_normalizer_extension( + mappings.extend( + self._get_vicc_normalizer_mappings( normalized_therapeutic_id, therapy_norm_resp ) - ] + ) + extensions = [] if regulatory_approval_extension: extensions.append(regulatory_approval_extension) @@ -1023,8 +1018,8 @@ def _get_therapy(self, therapy: dict) -> MappableConcept | None: id=therapy_id, label=label, conceptType="Therapy", - mappings=mappings if mappings else None, - extensions=extensions, + mappings=mappings, + extensions=extensions or None, ) def _get_therapeutic_metadata( diff --git a/src/metakb/transformers/moa.py b/src/metakb/transformers/moa.py index bab7aec7..91101f8e 100644 --- a/src/metakb/transformers/moa.py +++ b/src/metakb/transformers/moa.py @@ -385,11 +385,9 @@ def _add_genes(self, genes: list[str]) -> None: id=f"moa.normalize.gene:{quote(gene)}", conceptType="Gene", label=gene, - extensions=[ - self._get_vicc_normalizer_extension( - normalized_gene_id, gene_norm_resp - ) - ], + mappings=self._get_vicc_normalizer_mappings( + normalized_gene_id, gene_norm_resp + ), ) self.able_to_normalize["genes"][quote(gene)] = moa_gene self.processed_data.genes.append(moa_gene) @@ -508,11 +506,7 @@ def _get_therapy(self, therapy: dict) -> MappableConcept | None: logger.debug("Therapy Normalizer unable to normalize: %s", therapy) return None - extensions = [ - self._get_vicc_normalizer_extension( - normalized_therapeutic_id, therapy_norm_resp - ) - ] + extensions = [] regulatory_approval_extension = ( self.vicc_normalizers.get_regulatory_approval_extension(therapy_norm_resp) @@ -525,7 +519,10 @@ def _get_therapy(self, therapy: dict) -> MappableConcept | None: id=f"moa.{therapy_norm_resp.therapy.id}", conceptType="Therapy", label=therapy["label"], - extensions=extensions, + mappings=self._get_vicc_normalizer_mappings( + normalized_therapeutic_id, therapy_norm_resp + ), + extensions=extensions or None, ) def _add_disease(self, disease: dict) -> dict | None: @@ -628,14 +625,13 @@ def _get_disease(self, 
disease: dict) -> MappableConcept | None: logger.debug("Disease Normalizer unable to normalize: %s", queries) return None + mappings.extend( + self._get_vicc_normalizer_mappings(normalized_disease_id, disease_norm_resp) + ) + return MappableConcept( id=f"moa.{disease_norm_resp.disease.id}", conceptType="Disease", label=disease_name, mappings=mappings if mappings else None, - extensions=[ - self._get_vicc_normalizer_extension( - normalized_disease_id, disease_norm_resp - ) - ], ) diff --git a/tests/conftest.py b/tests/conftest.py index 4acfea55..531047a2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,7 +8,7 @@ from deepdiff import DeepDiff from metakb.harvesters.base import Harvester -from metakb.normalizers import VICC_NORMALIZER_DATA, ViccNormalizers +from metakb.normalizers import ViccNormalizers from metakb.query import QueryHandler TEST_DATA_DIR = Path(__file__).resolve().parents[0] / "data" @@ -59,14 +59,47 @@ def check_source_harvest(tmp_path: Path, harvester: Harvester): assert not harvested_filepath.exists() +def get_vicc_normalizer_ext(is_priority: bool): + """Create test fixture for vicc normalizer priority extension""" + return [{"name": "vicc_normalizer_priority", "value": is_priority}] + + @pytest.fixture(scope="session") -def cetuximab_extensions(): - """Create test fixture for cetuximab extensions""" +def braf_normalizer_mappings(): + """Create test fixture for braf normalizer mappings""" return [ { - "name": VICC_NORMALIZER_DATA, - "value": {"id": "rxcui:318341", "label": "cetuximab"}, + "coding": { + "label": "BRAF", + "code": "hgnc:1097", + "system": "https://www.genenames.org", + }, + "relation": "exactMatch", + "extensions": get_vicc_normalizer_ext(is_priority=True), }, + ] + + +@pytest.fixture(scope="session") +def cetuximab_normalizer_mappings(): + """Create test fixture for cetuximab normalizer mappings""" + return [ + { + "coding": { + "label": "cetuximab", + "code": "rxcui:318341", + "system": 
"https://www.nlm.nih.gov/research/umls/rxnorm/index.html", + }, + "relation": "exactMatch", + "extensions": get_vicc_normalizer_ext(is_priority=True), + } + ] + + +@pytest.fixture(scope="session") +def cetuximab_extensions(): + """Create test fixture for cetuximab extensions""" + return [ { "name": "regulatory_approval", "value": { @@ -135,16 +168,25 @@ def cetuximab_extensions(): @pytest.fixture(scope="session") -def encorafenib_extensions(): - """Create test fixture for encorafenib extensions""" +def encorafenib_normalizer_mappings(): + """Create test fixture for encorafenib normalizer mappings""" return [ { - "name": VICC_NORMALIZER_DATA, - "value": { - "id": "rxcui:2049106", + "coding": { "label": "encorafenib", + "code": "rxcui:2049106", + "system": "https://www.nlm.nih.gov/research/umls/rxnorm/index.html", }, - }, + "relation": "exactMatch", + "extensions": get_vicc_normalizer_ext(is_priority=True), + } + ] + + +@pytest.fixture(scope="session") +def encorafenib_extensions(): + """Create test fixture for encorafenib extensions""" + return [ { "name": "regulatory_approval", "value": { @@ -396,7 +438,7 @@ def civic_eid2997_study_stmt( @pytest.fixture(scope="session") -def civic_gid5(): +def civic_gid5(braf_normalizer_mappings): """Create test fixture for CIViC GID5.""" return { "id": "civic.gid:5", @@ -410,17 +452,14 @@ def civic_gid5(): "system": "https://www.ncbi.nlm.nih.gov/gene/", }, "relation": "exactMatch", - } + }, + *braf_normalizer_mappings, ], "extensions": [ { "name": "description", "value": "BRAF mutations are found to be recurrent in many cancer types. Of these, the mutation of valine 600 to glutamic acid (V600E) is the most prevalent. V600E has been determined to be an activating mutation, and cells that harbor it, along with other V600 mutations are sensitive to the BRAF inhibitor dabrafenib. It is also common to use MEK inhibition as a substitute for BRAF inhibitors, and the MEK inhibitor trametinib has seen some success in BRAF mutant melanomas. 
BRAF mutations have also been correlated with poor prognosis in many cancer types, although there is at least one study that questions this conclusion in papillary thyroid cancer.\n\nOncogenic BRAF mutations are divided into three categories that determine their sensitivity to inhibitors.\nClass 1 BRAF mutations (V600) are RAS-independent, signal as monomers and are sensitive to current RAF monomer inhibitors.\nClass 2 BRAF mutations (K601E, K601N, K601T, L597Q, L597V, G469A, G469V, G469R, G464V, G464E, and fusions) are RAS-independent, signaling as constitutive dimers and are resistant to vemurafenib. Such mutants may be sensitive to novel RAF dimer inhibitors or MEK inhibitors.\nClass 3 BRAF mutations (D287H, V459L, G466V, G466E, G466A, S467L, G469E, N581S, N581I, D594N, D594G, D594A, D594H, F595L, G596D, and G596R) with low or absent kinase activity are RAS-dependent and they activate ERK by increasing their binding to activated RAS and wild-type CRAF. Class 3 BRAF mutations coexist with mutations in RAS or NF1 in melanoma may be treated with MEK inhibitors. 
In epithelial tumors such as CRC or NSCLC may be effectively treated with combinations that include inhibitors of receptor tyrosine kinase.", }, - { - "name": VICC_NORMALIZER_DATA, - "value": {"id": "hgnc:1097", "label": "BRAF"}, - }, { "name": "aliases", "value": [ @@ -640,7 +679,16 @@ def civic_gid19(): "system": "https://www.ncbi.nlm.nih.gov/gene/", }, "relation": "exactMatch", - } + }, + { + "coding": { + "label": "EGFR", + "code": "hgnc:3236", + "system": "https://www.genenames.org", + }, + "relation": "exactMatch", + "extensions": get_vicc_normalizer_ext(is_priority=True), + }, ], "extensions": [ { @@ -660,10 +708,6 @@ def civic_gid19(): "mENA", ], }, - { - "name": VICC_NORMALIZER_DATA, - "value": {"id": "hgnc:3236", "label": "EGFR"}, - }, ], } @@ -683,7 +727,16 @@ def civic_tid146(): "system": "https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&code=", }, "relation": "exactMatch", - } + }, + { + "coding": { + "label": "afatinib", + "code": "rxcui:1430438", + "system": "https://www.nlm.nih.gov/research/umls/rxnorm/index.html", + }, + "relation": "exactMatch", + "extensions": get_vicc_normalizer_ext(is_priority=True), + }, ], "extensions": [ { @@ -721,13 +774,6 @@ def civic_tid146(): ], }, }, - { - "name": VICC_NORMALIZER_DATA, - "value": { - "id": "rxcui:1430438", - "label": "afatinib", - }, - }, ], } @@ -747,17 +793,24 @@ def civic_did8(): "system": "https://disease-ontology.org/?id=", }, "relation": "exactMatch", - } - ], - "extensions": [ + }, { - "name": VICC_NORMALIZER_DATA, - "value": { - "id": "ncit:C2926", + "coding": { "label": "Lung Non-Small Cell Carcinoma", - "mondo_id": "mondo:0005233", + "code": "ncit:C2926", + "system": "http://purl.obolibrary.org/obo/ncit.owl", }, - } + "relation": "exactMatch", + "extensions": get_vicc_normalizer_ext(is_priority=True), + }, + { + "coding": { + "code": "mondo:0005233", + "system": "http://purl.obolibrary.org/obo/mondo.owl", + }, + "relation": "relatedMatch", + "extensions": 
get_vicc_normalizer_ext(is_priority=False), + }, ], } @@ -788,7 +841,16 @@ def civic_tid28(): "system": "https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&code=", }, "relation": "exactMatch", - } + }, + { + "coding": { + "label": "panitumumab", + "code": "rxcui:263034", + "system": "https://www.nlm.nih.gov/research/umls/rxnorm/index.html", + }, + "relation": "exactMatch", + "extensions": get_vicc_normalizer_ext(is_priority=True), + }, ], "extensions": [ { @@ -806,13 +868,6 @@ def civic_tid28(): "Vectibix", ], }, - { - "name": VICC_NORMALIZER_DATA, - "value": { - "id": "rxcui:263034", - "label": "panitumumab", - }, - }, { "name": "regulatory_approval", "value": { @@ -854,7 +909,7 @@ def civic_tid28(): @pytest.fixture(scope="session") -def civic_tid16(cetuximab_extensions): +def civic_tid16(cetuximab_extensions, cetuximab_normalizer_mappings): """Create test fixture for CIViC therapy ID 16""" return { "id": "civic.tid:16", @@ -868,7 +923,8 @@ def civic_tid16(cetuximab_extensions): "system": "https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&code=", }, "relation": "exactMatch", - } + }, + *cetuximab_normalizer_mappings, ], "extensions": [ *cetuximab_extensions, @@ -906,13 +962,14 @@ def civic_tsg(civic_tid16, civic_tid28): @pytest.fixture(scope="session") -def civic_tid483(encorafenib_extensions): +def civic_tid483(encorafenib_extensions, encorafenib_normalizer_mappings): """Create test fixture for CIViC Therapy ID 483""" return { "id": "civic.tid:483", "conceptType": "Therapy", "label": "Encorafenib", "mappings": [ + *encorafenib_normalizer_mappings, { "coding": { "id": "ncit:C98283", @@ -920,7 +977,7 @@ def civic_tid483(encorafenib_extensions): "system": "https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&code=", }, "relation": "exactMatch", - } + }, ], "extensions": [ *encorafenib_extensions, @@ -960,17 +1017,24 @@ def civic_did11(): "system": 
"https://disease-ontology.org/?id=", }, "relation": "exactMatch", - } - ], - "extensions": [ + }, { - "name": VICC_NORMALIZER_DATA, - "value": { - "id": "ncit:C4978", + "coding": { "label": "Malignant Colorectal Neoplasm", - "mondo_id": "mondo:0005575", + "code": "ncit:C4978", + "system": "http://purl.obolibrary.org/obo/ncit.owl", }, - } + "relation": "exactMatch", + "extensions": get_vicc_normalizer_ext(is_priority=True), + }, + { + "coding": { + "code": "mondo:0005575", + "system": "http://purl.obolibrary.org/obo/mondo.owl", + }, + "relation": "relatedMatch", + "extensions": get_vicc_normalizer_ext(is_priority=False), + }, ], } @@ -1392,16 +1456,6 @@ def civic_did3(): "id": "civic.did:3", "conceptType": "Disease", "label": "Acute Myeloid Leukemia", - "extensions": [ - { - "name": "vicc_normalizer_data", - "value": { - "id": "ncit:C3171", - "label": "Acute Myeloid Leukemia", - "mondo_id": "mondo:0018874", - }, - } - ], "mappings": [ { "coding": { @@ -1410,7 +1464,24 @@ def civic_did3(): "code": "DOID:9119", }, "relation": "exactMatch", - } + }, + { + "coding": { + "label": "Acute Myeloid Leukemia", + "code": "ncit:C3171", + "system": "http://purl.obolibrary.org/obo/ncit.owl", + }, + "relation": "exactMatch", + "extensions": get_vicc_normalizer_ext(is_priority=True), + }, + { + "coding": { + "code": "mondo:0018874", + "system": "http://purl.obolibrary.org/obo/mondo.owl", + }, + "relation": "relatedMatch", + "extensions": get_vicc_normalizer_ext(is_priority=False), + }, ], } @@ -1431,10 +1502,6 @@ def civic_gid29(): "name": "aliases", "value": ["MASTC", "KIT", "SCFR", "PBT", "CD117", "C-Kit"], }, - { - "name": "vicc_normalizer_data", - "value": {"id": "hgnc:6342", "label": "KIT"}, - }, ], "mappings": [ { @@ -1444,7 +1511,16 @@ def civic_gid29(): "code": "3815", }, "relation": "exactMatch", - } + }, + { + "coding": { + "system": "https://www.genenames.org", + "code": "hgnc:6342", + "label": "KIT", + }, + "relation": "exactMatch", + "extensions": 
get_vicc_normalizer_ext(is_priority=True), + }, ], } @@ -1636,10 +1712,15 @@ def moa_abl1(): "id": "moa.normalize.gene:ABL1", "conceptType": "Gene", "label": "ABL1", - "extensions": [ + "mappings": [ { - "name": VICC_NORMALIZER_DATA, - "value": {"id": "hgnc:76", "label": "ABL1"}, + "coding": { + "label": "ABL1", + "code": "hgnc:76", + "system": "https://www.genenames.org", + }, + "relation": "exactMatch", + "extensions": get_vicc_normalizer_ext(is_priority=True), } ], } @@ -1764,12 +1845,16 @@ def moa_imatinib(): ], }, }, + ], + "mappings": [ { - "name": VICC_NORMALIZER_DATA, - "value": { - "id": "rxcui:282388", + "coding": { "label": "imatinib", + "code": "rxcui:282388", + "system": "https://www.nlm.nih.gov/research/umls/rxnorm/index.html", }, + "relation": "exactMatch", + "extensions": get_vicc_normalizer_ext(is_priority=True), }, ], } @@ -1782,16 +1867,6 @@ def moa_chronic_myelogenous_leukemia(): "id": "moa.normalize.disease.ncit:C3174", "conceptType": "Disease", "label": "Chronic Myelogenous Leukemia", - "extensions": [ - { - "name": VICC_NORMALIZER_DATA, - "value": { - "id": "ncit:C3174", - "label": "Chronic Myeloid Leukemia, BCR-ABL1 Positive", - "mondo_id": "mondo:0011996", - }, - } - ], "mappings": [ { "coding": { @@ -1801,7 +1876,24 @@ def moa_chronic_myelogenous_leukemia(): "code": "CML", }, "relation": "exactMatch", - } + }, + { + "coding": { + "label": "Chronic Myeloid Leukemia, BCR-ABL1 Positive", + "code": "ncit:C3174", + "system": "http://purl.obolibrary.org/obo/ncit.owl", + }, + "relation": "exactMatch", + "extensions": get_vicc_normalizer_ext(is_priority=True), + }, + { + "coding": { + "code": "mondo:0011996", + "system": "http://purl.obolibrary.org/obo/mondo.owl", + }, + "relation": "relatedMatch", + "extensions": get_vicc_normalizer_ext(is_priority=False), + }, ], } diff --git a/tests/unit/database/test_database.py b/tests/unit/database/test_database.py index 9c52194f..bed7b728 100644 --- a/tests/unit/database/test_database.py +++ 
b/tests/unit/database/test_database.py @@ -7,8 +7,8 @@ from neo4j.graph import Node from metakb.database import get_driver -from metakb.normalizers import VICC_NORMALIZER_DATA, ViccDiseaseNormalizerData from metakb.schemas.app import SourceName +from metakb.transformers.base import NORMALIZER_PRIORITY_EXT_NAME @pytest.fixture(scope="module") @@ -154,16 +154,7 @@ def _check_function( ): checked = set() for ext in fixture_extensions: - if ext["name"] == VICC_NORMALIZER_DATA: - for normalized_field in ViccDiseaseNormalizerData.model_fields: - normalized_val = ext["value"].get(normalized_field) - if normalized_val is None: - continue - - ext_name = f"normalizer_{normalized_field}" - assert node[ext_name] == ext["value"][normalized_field] - checked.add(ext_name) - elif ext["name"] in ext_names: + if ext["name"] in ext_names: try: assert json.loads(node[ext["name"]]) == ext["value"] except json.decoder.JSONDecodeError: @@ -192,6 +183,12 @@ def _check_function( for k in expected_keys - extension_names: if k == "mappings" or (k == "subtype" and isinstance(fixture[k], dict)): assert json.loads(node[k]) == fixture[k] + elif k == "normalizer_id": + for mapping in fixture["mappings"]: + extensions = mapping.get("extensions") or [] + for ext in extensions: + if ext["name"] == NORMALIZER_PRIORITY_EXT_NAME and ext["value"]: + assert node[k] == mapping["coding"]["code"] elif isinstance(fixture[k], list): assert set(node[k]) == set(fixture[k]) else: @@ -224,11 +221,10 @@ def test_gene_rules( check_node_labels("Gene", expected_labels, 1) gene = get_node_by_id(civic_gid5["id"]) - extension_names = {"normalizer_label", "normalizer_id", "description", "aliases"} + extension_names = {"description", "aliases"} check_extension_props(gene, civic_gid5["extensions"], extension_names) expected_keys = { "normalizer_id", - "normalizer_label", "label", "id", "description", @@ -462,8 +458,6 @@ def test_therapy_rules( # Test Therapy ta = get_node_by_id(civic_tid146["id"]) extension_names = { - 
"normalizer_id", - "normalizer_label", "regulatory_approval", "aliases", } @@ -473,7 +467,6 @@ def test_therapy_rules( "label", "aliases", "normalizer_id", - "normalizer_label", "regulatory_approval", "mappings", "conceptType", @@ -514,22 +507,15 @@ def test_condition_rules( check_node_labels("Condition", expected_node_labels, 1) disease = get_node_by_id(civic_did8["id"]) - extension_names = { - "normalizer_id", - "normalizer_label", - "normalizer_mondo_id", - } - check_extension_props(disease, civic_did8["extensions"], extension_names) + expected_keys = { "id", "label", "mappings", "normalizer_id", - "normalizer_label", - "normalizer_mondo_id", "conceptType", } - check_node_props(disease, civic_did8, expected_keys, extension_names) + check_node_props(disease, civic_did8, expected_keys) def test_statement_rules( diff --git a/tests/unit/search/test_search_statements.py b/tests/unit/search/test_search_statements.py index 5b406e17..2e80a0f1 100644 --- a/tests/unit/search/test_search_statements.py +++ b/tests/unit/search/test_search_statements.py @@ -1,24 +1,23 @@ """Test search statement methods""" import pytest -from ga4gh.core.models import Extension +from ga4gh.core.models import MappableConcept -from metakb.normalizers import VICC_NORMALIZER_DATA from metakb.query import QueryHandler from .utils import assert_no_match, find_and_check_stmt -def _get_normalizer_id(extensions: list[Extension]) -> str | None: +def _get_normalizer_id(mappings: list[MappableConcept]) -> str | None: """Get normalized ID from list of extensions - :param extensions: List of extensions + :param mappings: List of mappable concepts :return: Normalized concept ID if found in extensions """ normalizer_id = None - for ext in extensions: - if ext.name == VICC_NORMALIZER_DATA: - normalizer_id = ext.value["id"] + for mapping in mappings: + if mapping.extensions == "from_vicc_normalizer": + normalizer_id = mapping.code.root break return normalizer_id @@ -217,11 +216,11 @@ async def 
test_general_search_statements(query_handler): tp = statement.proposition.objectTherapeutic.root if hasattr(tp, "conceptType"): - assert _get_normalizer_id(tp.extensions) == expected_therapy_id + assert _get_normalizer_id(tp.mappings) == expected_therapy_id else: found_expected = False for therapeutic in tp.therapies: - if _get_normalizer_id(therapeutic.extensions) == expected_therapy_id: + if _get_normalizer_id(therapeutic.mappings) == expected_therapy_id: found_expected = True break assert found_expected diff --git a/tests/unit/transformers/test_civic_transformer_diagnostic.py b/tests/unit/transformers/test_civic_transformer_diagnostic.py index 6c3ba52d..07b3f1ff 100644 --- a/tests/unit/transformers/test_civic_transformer_diagnostic.py +++ b/tests/unit/transformers/test_civic_transformer_diagnostic.py @@ -4,9 +4,8 @@ import pytest import pytest_asyncio -from tests.conftest import TEST_TRANSFORMERS_DIR +from tests.conftest import TEST_TRANSFORMERS_DIR, get_vicc_normalizer_ext -from metakb.normalizers import VICC_NORMALIZER_DATA from metakb.transformers.civic import CivicTransformer DATA_DIR = TEST_TRANSFORMERS_DIR / "diagnostic" @@ -187,7 +186,16 @@ def civic_gid38(): "system": "https://www.ncbi.nlm.nih.gov/gene/", }, "relation": "exactMatch", - } + }, + { + "coding": { + "label": "PDGFRA", + "code": "hgnc:8803", + "system": "https://www.genenames.org", + }, + "relation": "exactMatch", + "extensions": get_vicc_normalizer_ext(is_priority=True), + }, ], "extensions": [ { @@ -198,10 +206,6 @@ def civic_gid38(): "name": "aliases", "value": ["CD140A", "PDGFR-2", "PDGFR2", "PDGFRA"], }, - { - "name": VICC_NORMALIZER_DATA, - "value": {"id": "hgnc:8803", "label": "PDGFRA"}, - }, ], } @@ -221,17 +225,24 @@ def civic_did2(): "system": "https://disease-ontology.org/?id=", }, "relation": "exactMatch", - } - ], - "extensions": [ + }, { - "name": VICC_NORMALIZER_DATA, - "value": { - "id": "ncit:C3868", + "coding": { "label": "Gastrointestinal Stromal Tumor", - "mondo_id": 
"mondo:0011719", + "code": "ncit:C3868", + "system": "http://purl.obolibrary.org/obo/ncit.owl", }, - } + "relation": "exactMatch", + "extensions": get_vicc_normalizer_ext(is_priority=True), + }, + { + "coding": { + "code": "mondo:0011719", + "system": "http://purl.obolibrary.org/obo/mondo.owl", + }, + "relation": "relatedMatch", + "extensions": get_vicc_normalizer_ext(is_priority=False), + }, ], } @@ -448,7 +459,16 @@ def civic_gid42(): "system": "https://www.ncbi.nlm.nih.gov/gene/", }, "relation": "exactMatch", - } + }, + { + "coding": { + "label": "RET", + "code": "hgnc:9967", + "system": "https://www.genenames.org", + }, + "relation": "exactMatch", + "extensions": get_vicc_normalizer_ext(is_priority=True), + }, ], "extensions": [ { @@ -469,10 +489,6 @@ def civic_gid42(): "RET-ELE1", ], }, - { - "name": VICC_NORMALIZER_DATA, - "value": {"id": "hgnc:9967", "label": "RET"}, - }, ], } @@ -492,17 +508,24 @@ def civic_did15(): "system": "https://disease-ontology.org/?id=", }, "relation": "exactMatch", - } - ], - "extensions": [ + }, { - "name": VICC_NORMALIZER_DATA, - "value": { - "id": "ncit:C3879", + "coding": { "label": "Thyroid Gland Medullary Carcinoma", - "mondo_id": "mondo:0015277", + "code": "ncit:C3879", + "system": "http://purl.obolibrary.org/obo/ncit.owl", }, - } + "relation": "exactMatch", + "extensions": get_vicc_normalizer_ext(is_priority=True), + }, + { + "coding": { + "code": "mondo:0015277", + "system": "http://purl.obolibrary.org/obo/mondo.owl", + }, + "relation": "relatedMatch", + "extensions": get_vicc_normalizer_ext(is_priority=False), + }, ], } diff --git a/tests/unit/transformers/test_moa_transformer_prognostic.py b/tests/unit/transformers/test_moa_transformer_prognostic.py index 73246978..933c0de9 100644 --- a/tests/unit/transformers/test_moa_transformer_prognostic.py +++ b/tests/unit/transformers/test_moa_transformer_prognostic.py @@ -4,9 +4,8 @@ import pytest import pytest_asyncio -from tests.conftest import TEST_TRANSFORMERS_DIR +from 
tests.conftest import TEST_TRANSFORMERS_DIR, get_vicc_normalizer_ext -from metakb.normalizers import VICC_NORMALIZER_DATA from metakb.transformers.moa import MoaTransformer DATA_DIR = TEST_TRANSFORMERS_DIR / "prognostic" @@ -115,16 +114,6 @@ def moa_myelodysplasia(): "id": "moa.normalize.disease.ncit:C3247", "conceptType": "Disease", "label": "Myelodysplasia", - "extensions": [ - { - "name": VICC_NORMALIZER_DATA, - "value": { - "id": "ncit:C3247", - "label": "Myelodysplastic Syndrome", - "mondo_id": "mondo:0018881", - }, - } - ], "mappings": [ { "coding": { @@ -134,7 +123,24 @@ def moa_myelodysplasia(): "id": "oncotree:MDS", }, "relation": "exactMatch", - } + }, + { + "coding": { + "label": "Myelodysplastic Syndrome", + "code": "ncit:C3247", + "system": "http://purl.obolibrary.org/obo/ncit.owl", + }, + "relation": "exactMatch", + "extensions": get_vicc_normalizer_ext(is_priority=True), + }, + { + "coding": { + "code": "mondo:0018881", + "system": "http://purl.obolibrary.org/obo/mondo.owl", + }, + "relation": "relatedMatch", + "extensions": get_vicc_normalizer_ext(is_priority=False), + }, ], } @@ -146,11 +152,16 @@ def moa_bcor(): "id": "moa.normalize.gene:BCOR", "conceptType": "Gene", "label": "BCOR", - "extensions": [ + "mappings": [ { - "name": VICC_NORMALIZER_DATA, - "value": {"id": "hgnc:20893", "label": "BCOR"}, - } + "coding": { + "label": "BCOR", + "code": "hgnc:20893", + "system": "https://www.genenames.org", + }, + "relation": "exactMatch", + "extensions": get_vicc_normalizer_ext(is_priority=True), + }, ], } @@ -308,11 +319,16 @@ def moa_sf3b1(): "id": "moa.normalize.gene:SF3B1", "conceptType": "Gene", "label": "SF3B1", - "extensions": [ + "mappings": [ { - "name": VICC_NORMALIZER_DATA, - "value": {"id": "hgnc:10768", "label": "SF3B1"}, - } + "coding": { + "label": "SF3B1", + "code": "hgnc:10768", + "system": "https://www.genenames.org", + }, + "relation": "exactMatch", + "extensions": get_vicc_normalizer_ext(is_priority=True), + }, ], } diff --git 
a/tests/unit/transformers/test_moa_transformer_therapeutic.py b/tests/unit/transformers/test_moa_transformer_therapeutic.py index a7647bd2..4426533e 100644 --- a/tests/unit/transformers/test_moa_transformer_therapeutic.py +++ b/tests/unit/transformers/test_moa_transformer_therapeutic.py @@ -4,9 +4,8 @@ import pytest import pytest_asyncio -from tests.conftest import TEST_TRANSFORMERS_DIR +from tests.conftest import TEST_TRANSFORMERS_DIR, get_vicc_normalizer_ext -from metakb.normalizers import VICC_NORMALIZER_DATA from metakb.transformers.moa import MoaTransformer DATA_DIR = TEST_TRANSFORMERS_DIR / "therapeutic" @@ -99,29 +98,37 @@ def moa_vid144(braf_v600e_genomic): @pytest.fixture(scope="module") -def moa_cetuximab(cetuximab_extensions): +def moa_cetuximab(cetuximab_extensions, cetuximab_normalizer_mappings): """Create a test fixture for MOA Cetuximab""" return { "id": "moa.normalize.therapy.rxcui:318341", "conceptType": "Therapy", "label": "Cetuximab", "extensions": cetuximab_extensions, + "mappings": cetuximab_normalizer_mappings, } @pytest.fixture(scope="module") -def moa_encorafenib(encorafenib_extensions): +def moa_encorafenib(encorafenib_extensions, encorafenib_normalizer_mappings): """Create test fixture for MOA Encorafenib""" return { "id": "moa.normalize.therapy.rxcui:2049106", "conceptType": "Therapy", "label": "Encorafenib", "extensions": encorafenib_extensions, + "mappings": encorafenib_normalizer_mappings, } @pytest.fixture(scope="module") -def moa_aid154_study_stmt(moa_vid144, moa_cetuximab, moa_encorafenib, moa_method): +def moa_aid154_study_stmt( + moa_vid144, + moa_cetuximab, + moa_encorafenib, + moa_method, + braf_normalizer_mappings, +): """Create MOA AID 154 study statement test fixture. 
Uses CombinationTherapy.""" return { "id": "moa.assertion:154", @@ -170,16 +177,6 @@ def moa_aid154_study_stmt(moa_vid144, moa_cetuximab, moa_encorafenib, moa_method "id": "moa.normalize.disease.ncit:C5105", "conceptType": "Disease", "label": "Colorectal Adenocarcinoma", - "extensions": [ - { - "name": VICC_NORMALIZER_DATA, - "value": { - "id": "ncit:C5105", - "label": "Colorectal Adenocarcinoma", - "mondo_id": "mondo:0005008", - }, - } - ], "mappings": [ { "coding": { @@ -189,7 +186,24 @@ def moa_aid154_study_stmt(moa_vid144, moa_cetuximab, moa_encorafenib, moa_method "id": "oncotree:COADREAD", }, "relation": "exactMatch", - } + }, + { + "coding": { + "label": "Colorectal Adenocarcinoma", + "code": "ncit:C5105", + "system": "http://purl.obolibrary.org/obo/ncit.owl", + }, + "relation": "exactMatch", + "extensions": get_vicc_normalizer_ext(is_priority=True), + }, + { + "coding": { + "code": "mondo:0005008", + "system": "http://purl.obolibrary.org/obo/mondo.owl", + }, + "relation": "relatedMatch", + "extensions": get_vicc_normalizer_ext(is_priority=False), + }, ], }, "alleleOriginQualifier": {"label": "somatic"}, @@ -197,12 +211,7 @@ def moa_aid154_study_stmt(moa_vid144, moa_cetuximab, moa_encorafenib, moa_method "id": "moa.normalize.gene:BRAF", "conceptType": "Gene", "label": "BRAF", - "extensions": [ - { - "name": VICC_NORMALIZER_DATA, - "value": {"id": "hgnc:1097", "label": "BRAF"}, - } - ], + "mappings": braf_normalizer_mappings, }, }, "specifiedBy": moa_method, From 704602a0e2b7921580c1db8fba248cfdd614c13f Mon Sep 17 00:00:00 2001 From: Kori Kuzma Date: Mon, 27 Jan 2025 18:46:48 -0500 Subject: [PATCH 2/5] update tests --- tests/__init__.py | 1 + tests/conftest.py | 20 +++++++++++++ tests/unit/database/test_database.py | 9 ++---- tests/unit/search/test_search_statements.py | 31 ++++++++------------- 4 files changed, 36 insertions(+), 25 deletions(-) create mode 100644 tests/__init__.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 
00000000..452f23f9 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""Initialize the tests namespace for shared utilities.""" diff --git a/tests/conftest.py b/tests/conftest.py index 531047a2..8cf322db 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,10 +6,12 @@ import pytest from deepdiff import DeepDiff +from ga4gh.core.models import ConceptMapping from metakb.harvesters.base import Harvester from metakb.normalizers import ViccNormalizers from metakb.query import QueryHandler +from metakb.transformers.base import NORMALIZER_PRIORITY_EXT_NAME TEST_DATA_DIR = Path(__file__).resolve().parents[0] / "data" TEST_HARVESTERS_DIR = TEST_DATA_DIR / "harvesters" @@ -64,6 +66,24 @@ def get_vicc_normalizer_ext(is_priority: bool): return [{"name": "vicc_normalizer_priority", "value": is_priority}] +def get_mappings_normalizer_id(mappings: list[dict | ConceptMapping]) -> str | None: + """Get normalizer ID from list of concept mappings + + :param mappings: List of concept mappings + :return: Normalizer ID + """ + normalizer_id = None + for mapping in mappings: + if isinstance(mapping, ConceptMapping): + mapping = mapping.model_dump() + extensions = mapping.get("extensions") or [] + for ext in extensions: + if ext["name"] == NORMALIZER_PRIORITY_EXT_NAME and ext["value"]: + normalizer_id = mapping["coding"]["code"] + break + return normalizer_id + + @pytest.fixture(scope="session") def braf_normalizer_mappings(): """Create test fixture for braf normalizer mappings""" diff --git a/tests/unit/database/test_database.py b/tests/unit/database/test_database.py index bed7b728..2bc8ee8f 100644 --- a/tests/unit/database/test_database.py +++ b/tests/unit/database/test_database.py @@ -5,10 +5,10 @@ import pytest from neo4j import Driver from neo4j.graph import Node +from tests.conftest import get_mappings_normalizer_id from metakb.database import get_driver from metakb.schemas.app import SourceName -from metakb.transformers.base import NORMALIZER_PRIORITY_EXT_NAME 
@pytest.fixture(scope="module") @@ -184,11 +184,8 @@ def _check_function( if k == "mappings" or (k == "subtype" and isinstance(fixture[k], dict)): assert json.loads(node[k]) == fixture[k] elif k == "normalizer_id": - for mapping in fixture["mappings"]: - extensions = mapping.get("extensions") or [] - for ext in extensions: - if ext["name"] == NORMALIZER_PRIORITY_EXT_NAME and ext["value"]: - assert node[k] == mapping["coding"]["code"] + normalizer_id = get_mappings_normalizer_id(fixture["mappings"]) + assert node[k] == normalizer_id elif isinstance(fixture[k], list): assert set(node[k]) == set(fixture[k]) else: diff --git a/tests/unit/search/test_search_statements.py b/tests/unit/search/test_search_statements.py index 2e80a0f1..da38ac12 100644 --- a/tests/unit/search/test_search_statements.py +++ b/tests/unit/search/test_search_statements.py @@ -1,27 +1,13 @@ """Test search statement methods""" import pytest -from ga4gh.core.models import MappableConcept +from tests.conftest import get_mappings_normalizer_id from metakb.query import QueryHandler from .utils import assert_no_match, find_and_check_stmt -def _get_normalizer_id(mappings: list[MappableConcept]) -> str | None: - """Get normalized ID from list of extensions - - :param mappings: List of mappable concepts - :return: Normalized concept ID if found in extensions - """ - normalizer_id = None - for mapping in mappings: - if mapping.extensions == "from_vicc_normalizer": - normalizer_id = mapping.code.root - break - return normalizer_id - - def assert_general_search_stmts(response): """Check that general search_statements queries return a valid response""" len_stmt_id_matches = len(response.statement_ids) @@ -216,11 +202,14 @@ async def test_general_search_statements(query_handler): tp = statement.proposition.objectTherapeutic.root if hasattr(tp, "conceptType"): - assert _get_normalizer_id(tp.mappings) == expected_therapy_id + assert get_mappings_normalizer_id(tp.mappings) == expected_therapy_id else: 
found_expected = False for therapeutic in tp.therapies: - if _get_normalizer_id(therapeutic.mappings) == expected_therapy_id: + if ( + get_mappings_normalizer_id(therapeutic.mappings) + == expected_therapy_id + ): found_expected = True break assert found_expected @@ -245,11 +234,15 @@ async def test_general_search_statements(query_handler): == expected_variation_id ) assert ( - _get_normalizer_id(statement.proposition.objectTherapeutic.root.extensions) + get_mappings_normalizer_id( + statement.proposition.objectTherapeutic.root.mappings + ) == expected_therapy_id ) assert ( - _get_normalizer_id(statement.proposition.conditionQualifier.root.extensions) + get_mappings_normalizer_id( + statement.proposition.conditionQualifier.root.mappings + ) == expected_disease_id ) From 53770c743afde217898dcb13547549b4faeac693 Mon Sep 17 00:00:00 2001 From: Kori Kuzma Date: Mon, 27 Jan 2025 18:57:22 -0500 Subject: [PATCH 3/5] cleanup --- src/metakb/load_data.py | 45 +++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/src/metakb/load_data.py b/src/metakb/load_data.py index 658ccf4c..81593323 100644 --- a/src/metakb/load_data.py +++ b/src/metakb/load_data.py @@ -32,12 +32,29 @@ def _create_parameterized_query( def _add_mappings_and_exts_to_obj(obj: dict, obj_keys: list[str]) -> None: """Get mappings and extensions from object and add to `obj` and `obj_keys` - :param obj: Object to update with mappings and extensions (if found) + :param obj: Object to update with mappings and extensions (if found). + If ``obj`` has Disease, Gene, or Therapy ``conceptType``, then ``normalizer_id`` + will also be added. :param obj_keys: Parameterized queries. 
This will be mutated if mappings and extensions exists """ mappings = obj.get("mappings", []) if mappings: + concept_type = obj.get("conceptType") + if concept_type in {"Disease", "Gene", "Therapy"}: + normalizer_id = None + for mapping in obj["mappings"]: + extensions = mapping.get("extensions") or [] + for ext in extensions: + if ext["name"] == NORMALIZER_PRIORITY_EXT_NAME and ext["value"]: + normalizer_id = mapping["coding"]["code"] + obj["normalizer_id"] = normalizer_id + obj_keys.append("normalizer_id:$normalizer_id") + break + + if normalizer_id: + break + obj["mappings"] = json.dumps(mappings) obj_keys.append("mappings:$mappings") @@ -88,26 +105,6 @@ def _add_method(tx: ManagedTransaction, method: dict, ids_in_stmts: set[str]) -> tx.run(query, **method) -def _add_normalizer_id_to_obj(obj: dict, obj_keys: list[str]) -> None: - """Get normalizer ID and add to ``obj`` and ``obj_keys`` - - :param obj: Object to update with ``normalizer_id`` - :param obj_keys: Parameterized queries. This will be mutated. 
- """ - normalizer_id = None - for mapping in obj["mappings"]: - extensions = mapping.get("extensions") - if extensions: - for ext in extensions: - if ext["name"] == NORMALIZER_PRIORITY_EXT_NAME and ext["value"]: - normalizer_id = mapping["coding"]["code"] - break - - if normalizer_id: - obj["normalizer_id"] = normalizer_id - obj_keys.append("normalizer_id:$normalizer_id") - - def _add_gene_or_disease( tx: ManagedTransaction, obj_in: dict, ids_in_stmts: set[str] ) -> None: @@ -131,9 +128,6 @@ def _add_gene_or_disease( obj["conceptType"] = obj_type obj_keys = [_create_parameterized_query(obj, ("id", "label", "conceptType"))] - _add_normalizer_id_to_obj( - obj, obj_keys - ) # must be before _add_mappings_and_exts_to_obj _add_mappings_and_exts_to_obj(obj, obj_keys) obj_keys = ", ".join(obj_keys) @@ -204,9 +198,6 @@ def _add_therapy(tx: ManagedTransaction, therapy_in: dict) -> None: _create_parameterized_query(therapy, ("id", "label", "conceptType")) ] - _add_normalizer_id_to_obj( - therapy, nonnull_keys - ) # must be before _add_mappings_and_exts_to_obj _add_mappings_and_exts_to_obj(therapy, nonnull_keys) nonnull_keys = ", ".join(nonnull_keys) From 0d3113a20482deeaa4b24044bff226117ca3d052 Mon Sep 17 00:00:00 2001 From: Kori Kuzma Date: Wed, 5 Feb 2025 14:41:11 -0500 Subject: [PATCH 4/5] updates needed for varcat --- src/metakb/transformers/base.py | 92 +++++++++++-------- src/metakb/transformers/civic.py | 55 ++++++++--- tests/conftest.py | 37 ++++++-- .../test_civic_transformer_diagnostic.py | 18 ++-- .../test_moa_transformer_prognostic.py | 16 ++++ .../test_moa_transformer_therapeutic.py | 13 ++- 6 files changed, 163 insertions(+), 68 deletions(-) diff --git a/src/metakb/transformers/base.py b/src/metakb/transformers/base.py index f49a8802..ff91635f 100644 --- a/src/metakb/transformers/base.py +++ b/src/metakb/transformers/base.py @@ -8,9 +8,6 @@ from pathlib import Path from typing import ClassVar -from disease.schemas import ( - SYSTEM_URI_TO_NAMESPACE as 
DISEASE_SYSTEM_URI_TO_NAMESPACE, -) from disease.schemas import ( NamespacePrefix as DiseaseNamespacePrefix, ) @@ -34,7 +31,12 @@ ) from ga4gh.va_spec.base import Document, Method, TherapyGroup from ga4gh.vrs.models import Allele -from gene.schemas import NormalizeService as NormalizedGene +from gene.schemas import ( + NamespacePrefix as GeneNamespacePrefix, +) +from gene.schemas import ( + NormalizeService as NormalizedGene, +) from pydantic import BaseModel, Field, StrictStr, ValidationError from therapy.schemas import NormalizationService as NormalizedTherapy @@ -112,6 +114,14 @@ class ViccConceptVocab(BaseModel): definition: StrictStr +class _TransformedRecordsCache(BaseModel): + """Define model for caching transformed records""" + + therapies: ClassVar[dict[str, MappableConcept]] = {} + conditions: ClassVar[dict[str, MappableConcept]] = {} + genes: ClassVar[dict[str, MappableConcept]] = {} + + class TransformedData(BaseModel): """Define model for transformed data""" @@ -521,8 +531,7 @@ def _add_therapy( if therapy: self.able_to_normalize["therapies"][therapy_id] = therapy self.processed_data.therapies.append(therapy) - else: - self.unable_to_normalize["therapies"].add(therapy_id) + return therapy @staticmethod @@ -537,20 +546,24 @@ def _get_vicc_normalizer_mappings( :return: List of VICC Normalizer data represented as mappable concept """ - def _add_merged_id_ext( + def _update_mapping( mapping: ConceptMapping, - is_priority: bool, - label: str | None = None, + normalized_id: str, + normalizer_label: str, ) -> Extension: - """Update ``mapping`` to include extension on whether mapping is from merged identifier + """Update ``mapping`` to include extension on whether ``mapping`` contains + code that matches the merged record's primary identifier. :param mapping: ConceptMapping from vicc normalizer. This will be mutated. - :param is_priority: ``True`` if concept mapping contains primaryCode that - matches merged record primaryCode. 
``False`` otherwise (meaning it comes - from merged record mappings) - :param label: Merged concept label, if found - :return: ConceptMapping with normalizer extension added + Extensions will be added. Label will be added if mapping identifier + matches normalized merged identifier. + :param normalized_id: Concept ID from normalized record + :param normalizer_label: Label from normalized record + :return: ConceptMapping with normalizer extension added as well as label ( + if mapping id matches normalized merged id) """ + is_priority = normalized_id == mapping.coding.code.root + merged_id_ext = Extension( name=NORMALIZER_PRIORITY_EXT_NAME, value=is_priority ) @@ -559,39 +572,40 @@ def _add_merged_id_ext( else: mapping.extensions = [merged_id_ext] - if label: - mapping.coding.label = label + if is_priority: + mapping.coding.label = normalizer_label return mapping mappings: list[ConceptMapping] = [] attr_name = NORMALIZER_INSTANCE_TO_ATTR[type(normalizer_resp)] normalizer_resp_obj = getattr(normalizer_resp, attr_name) + normalizer_label = normalizer_resp_obj.label + is_disease = isinstance(normalizer_resp, NormalizedDisease) + is_gene = isinstance(normalizer_resp, NormalizedGene) + normalizer_mappings = normalizer_resp_obj.mappings or [] - if isinstance(normalizer_resp, NormalizedDisease): - for mapping in normalizer_mappings: + for mapping in normalizer_mappings: + if normalized_id == mapping.coding.code.root: + mappings.append( + _update_mapping(mapping, normalized_id, normalizer_label) + ) + else: + mapping_code_lower = mapping.coding.code.root.lower() if ( - DISEASE_SYSTEM_URI_TO_NAMESPACE.get(mapping.coding.system) - == DiseaseNamespacePrefix.MONDO.value + is_disease + and mapping_code_lower.startswith( + DiseaseNamespacePrefix.MONDO.value + ) + ) or ( + is_gene + and mapping_code_lower.startswith( + (GeneNamespacePrefix.NCBI.value, GeneNamespacePrefix.HGNC.value) + ) ): - mappings.append(_add_merged_id_ext(mapping, is_priority=False)) - else: - if normalized_id 
== mapping.coding.code.root: - mappings.append( - _add_merged_id_ext( - mapping, - label=normalizer_resp_obj.label, - is_priority=True, - ) - ) - else: - mappings.extend( - _add_merged_id_ext( - mapping, label=normalizer_resp_obj.label, is_priority=True - ) - for mapping in normalizer_mappings - if normalized_id == mapping.coding.code.root - ) + mappings.append( + _update_mapping(mapping, normalized_id, normalizer_label) + ) return mappings def create_json(self, cdm_filepath: Path | None = None) -> None: diff --git a/src/metakb/transformers/civic.py b/src/metakb/transformers/civic.py index a8c699e8..1b54e957 100644 --- a/src/metakb/transformers/civic.py +++ b/src/metakb/transformers/civic.py @@ -788,28 +788,59 @@ def _add_genes(self, genes: list[dict]) -> None: :param genes: All genes in CIViC """ + + def _get_ncbi_mapping(ncbigene: str, gene: dict) -> ConceptMapping: + return ConceptMapping( + coding=Coding( + id=ncbigene, + code=str(gene["entrez_id"]), + system="https://www.ncbi.nlm.nih.gov/gene/", + ), + relation=Relation.EXACT_MATCH, + ) + for gene in genes: gene_id = f"civic.gid:{gene['id']}" ncbigene = f"ncbigene:{gene['entrez_id']}" queries = [ncbigene, gene["name"]] + gene["aliases"] + extensions = [] gene_norm_resp, normalized_gene_id = self.vicc_normalizers.normalize_gene( queries ) - mappings = [ - ConceptMapping( - coding=Coding( - id=ncbigene, - code=str(gene["entrez_id"]), - system="https://www.ncbi.nlm.nih.gov/gene/", - ), - relation=Relation.EXACT_MATCH, - ), - *self._get_vicc_normalizer_mappings(normalized_gene_id, gene_norm_resp), - ] + if not normalized_gene_id: + _logger.debug( + "Gene Normalizer unable to normalize: %s using queries %s", + gene_id, + queries, + ) + extensions.append(self._get_vicc_normalizer_failure_ext()) + mappings = [_get_ncbi_mapping(ncbigene, gene)] + else: + mappings = self._get_vicc_normalizer_mappings( + normalized_gene_id, gene_norm_resp + ) + + civic_ncbi_annotation_match = False + for mapping in mappings: + if 
mapping.coding.code.root.startswith("ncbigene:"): + if mapping.coding.code.root == ncbigene: + mapping.extensions.append( + Extension(name="civic_annotation", value=True) + ) + civic_ncbi_annotation_match = True + break + + _logger.debug( + "CIViC NCBI gene and Gene Normalizer mismatch: %s vs %s", + ncbigene, + mapping.coding.code.root, + ) + + if not civic_ncbi_annotation_match: + mappings.append(_get_ncbi_mapping(ncbigene, gene)) - extensions = [] if gene["aliases"]: extensions.append(Extension(name="aliases", value=gene["aliases"])) diff --git a/tests/conftest.py b/tests/conftest.py index 8cf322db..6b0f79b0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -467,11 +467,14 @@ def civic_gid5(braf_normalizer_mappings): "mappings": [ { "coding": { - "id": "ncbigene:673", - "code": "673", + "code": "ncbigene:673", "system": "https://www.ncbi.nlm.nih.gov/gene/", }, - "relation": "exactMatch", + "relation": "relatedMatch", + "extensions": [ + *get_vicc_normalizer_ext(is_priority=False), + {"name": "civic_annotation", "value": True}, + ], }, *braf_normalizer_mappings, ], @@ -694,11 +697,14 @@ def civic_gid19(): "mappings": [ { "coding": { - "id": "ncbigene:1956", - "code": "1956", + "code": "ncbigene:1956", "system": "https://www.ncbi.nlm.nih.gov/gene/", }, - "relation": "exactMatch", + "relation": "relatedMatch", + "extensions": [ + *get_vicc_normalizer_ext(is_priority=False), + {"name": "civic_annotation", "value": True}, + ], }, { "coding": { @@ -1527,10 +1533,13 @@ def civic_gid29(): { "coding": { "system": "https://www.ncbi.nlm.nih.gov/gene/", - "id": "ncbigene:3815", - "code": "3815", + "code": "ncbigene:3815", }, - "relation": "exactMatch", + "relation": "relatedMatch", + "extensions": [ + *get_vicc_normalizer_ext(is_priority=False), + {"name": "civic_annotation", "value": True}, + ], }, { "coding": { @@ -1741,7 +1750,15 @@ def moa_abl1(): }, "relation": "exactMatch", "extensions": get_vicc_normalizer_ext(is_priority=True), - } + }, + { + "coding": { + 
"code": "ncbigene:25", + "system": "https://www.ncbi.nlm.nih.gov/gene/", + }, + "relation": "relatedMatch", + "extensions": get_vicc_normalizer_ext(is_priority=False), + }, ], } diff --git a/tests/unit/transformers/test_civic_transformer_diagnostic.py b/tests/unit/transformers/test_civic_transformer_diagnostic.py index 07b3f1ff..dbf9776e 100644 --- a/tests/unit/transformers/test_civic_transformer_diagnostic.py +++ b/tests/unit/transformers/test_civic_transformer_diagnostic.py @@ -181,11 +181,14 @@ def civic_gid38(): "mappings": [ { "coding": { - "id": "ncbigene:5156", - "code": "5156", + "code": "ncbigene:5156", "system": "https://www.ncbi.nlm.nih.gov/gene/", }, - "relation": "exactMatch", + "relation": "relatedMatch", + "extensions": [ + *get_vicc_normalizer_ext(is_priority=False), + {"name": "civic_annotation", "value": True}, + ], }, { "coding": { @@ -454,11 +457,14 @@ def civic_gid42(): "mappings": [ { "coding": { - "id": "ncbigene:5979", - "code": "5979", + "code": "ncbigene:5979", "system": "https://www.ncbi.nlm.nih.gov/gene/", }, - "relation": "exactMatch", + "relation": "relatedMatch", + "extensions": [ + *get_vicc_normalizer_ext(is_priority=False), + {"name": "civic_annotation", "value": True}, + ], }, { "coding": { diff --git a/tests/unit/transformers/test_moa_transformer_prognostic.py b/tests/unit/transformers/test_moa_transformer_prognostic.py index 933c0de9..750b99af 100644 --- a/tests/unit/transformers/test_moa_transformer_prognostic.py +++ b/tests/unit/transformers/test_moa_transformer_prognostic.py @@ -162,6 +162,14 @@ def moa_bcor(): "relation": "exactMatch", "extensions": get_vicc_normalizer_ext(is_priority=True), }, + { + "coding": { + "code": "ncbigene:54880", + "system": "https://www.ncbi.nlm.nih.gov/gene/", + }, + "relation": "relatedMatch", + "extensions": get_vicc_normalizer_ext(is_priority=False), + }, ], } @@ -329,6 +337,14 @@ def moa_sf3b1(): "relation": "exactMatch", "extensions": get_vicc_normalizer_ext(is_priority=True), }, + { + 
"coding": { + "code": "ncbigene:23451", + "system": "https://www.ncbi.nlm.nih.gov/gene/", + }, + "relation": "relatedMatch", + "extensions": get_vicc_normalizer_ext(is_priority=False), + }, ], } diff --git a/tests/unit/transformers/test_moa_transformer_therapeutic.py b/tests/unit/transformers/test_moa_transformer_therapeutic.py index 4426533e..767a36af 100644 --- a/tests/unit/transformers/test_moa_transformer_therapeutic.py +++ b/tests/unit/transformers/test_moa_transformer_therapeutic.py @@ -130,6 +130,17 @@ def moa_aid154_study_stmt( braf_normalizer_mappings, ): """Create MOA AID 154 study statement test fixture. Uses CombinationTherapy.""" + braf_normalizer_mappings_cpy = braf_normalizer_mappings[:] + braf_normalizer_mappings_cpy.append( + { + "coding": { + "code": "ncbigene:673", + "system": "https://www.ncbi.nlm.nih.gov/gene/", + }, + "relation": "relatedMatch", + "extensions": get_vicc_normalizer_ext(is_priority=False), + }, + ) return { "id": "moa.assertion:154", "type": "Statement", @@ -211,7 +222,7 @@ def moa_aid154_study_stmt( "id": "moa.normalize.gene:BRAF", "conceptType": "Gene", "label": "BRAF", - "mappings": braf_normalizer_mappings, + "mappings": braf_normalizer_mappings_cpy, }, }, "specifiedBy": moa_method, From a012460bbca09ebd8a95608d05dbe0fdf2134a0a Mon Sep 17 00:00:00 2001 From: Kori Kuzma Date: Wed, 5 Feb 2025 16:12:28 -0500 Subject: [PATCH 5/5] docs --- src/metakb/transformers/civic.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/metakb/transformers/civic.py b/src/metakb/transformers/civic.py index 1b54e957..616724b6 100644 --- a/src/metakb/transformers/civic.py +++ b/src/metakb/transformers/civic.py @@ -789,10 +789,16 @@ def _add_genes(self, genes: list[dict]) -> None: :param genes: All genes in CIViC """ - def _get_ncbi_mapping(ncbigene: str, gene: dict) -> ConceptMapping: + def _get_ncbi_concept_mapping(ncbigene_id: str, gene: dict) -> ConceptMapping: + """Get NCBI gene mapping + + :param 
ncbigene_id: ID for NCBI Gene + :param gene: CIViC gene record + :return: Concept Mapping for NCBI Gene + """ return ConceptMapping( coding=Coding( - id=ncbigene, + id=ncbigene_id, code=str(gene["entrez_id"]), system="https://www.ncbi.nlm.nih.gov/gene/", ), @@ -816,7 +822,7 @@ def _get_ncbi_mapping(ncbigene: str, gene: dict) -> ConceptMapping: queries, ) extensions.append(self._get_vicc_normalizer_failure_ext()) - mappings = [_get_ncbi_mapping(ncbigene, gene)] + mappings = [_get_ncbi_concept_mapping(ncbigene, gene)] else: mappings = self._get_vicc_normalizer_mappings( normalized_gene_id, gene_norm_resp @@ -839,7 +845,7 @@ def _get_ncbi_mapping(ncbigene: str, gene: dict) -> ConceptMapping: ) if not civic_ncbi_annotation_match: - mappings.append(_get_ncbi_mapping(ncbigene, gene)) + mappings.append(_get_ncbi_concept_mapping(ncbigene, gene)) if gene["aliases"]: extensions.append(Extension(name="aliases", value=gene["aliases"]))