diff --git a/genedescriptions/config_parser.py b/genedescriptions/config_parser.py
index ef2be67..7f8d304 100644
--- a/genedescriptions/config_parser.py
+++ b/genedescriptions/config_parser.py
@@ -26,6 +26,7 @@ class ConfigModuleProperty(Enum):
     SLIM_URL = 15
     SLIM_BONUS_PERC = 16
     REMAP_TERMS = 17
+    DO_NOT_TRIM_BRANCH_AT = 18
 
 
 class GenedescConfigParser(object):
@@ -104,6 +105,8 @@ def _get_module_property_name(prop: ConfigModuleProperty):
             property_name = "slim_bonus_perc"
         elif prop == ConfigModuleProperty.REMAP_TERMS:
             property_name = "remap_terms"
+        elif prop == ConfigModuleProperty.DO_NOT_TRIM_BRANCH_AT:
+            property_name = "do_not_trim_branch_at"
         return property_name
 
     def get_prepostfix_sentence_map(self, module: Module, special_cases_only: bool = False, humans: bool = False):
diff --git a/genedescriptions/descriptions_generator.py b/genedescriptions/descriptions_generator.py
index 965b9c6..4387cea 100644
--- a/genedescriptions/descriptions_generator.py
+++ b/genedescriptions/descriptions_generator.py
@@ -153,6 +153,32 @@ def get_module_sentences(self, aspect: str, qualifier: str = '',
                                        put_anatomy_male_at_end=True if aspect == 'A' else False)
         return ModuleSentences(sentences)
 
+    def separate_do_not_trim_from_trim_terms(self, term_ids: List[str]):
+        """
+        Split terms into those that must be kept as they are and those that can be passed to the trimmer
+
+        A term is kept when it belongs to one of the branches rooted at the ids listed in the
+        'do_not_trim_branch_at' property of the current module
+
+        Args:
+            term_ids (List[str]): ids of the terms to split
+        Returns:
+            Tuple[List[str], List[str]]: the terms to keep and the terms that can be trimmed. When the property is
+                                         not set, the first list is empty and term_ids is returned unchanged
+        """
+        branch_root_ids = self.config.get_module_property(module=self.module,
+                                                          prop=ConfigModuleProperty.DO_NOT_TRIM_BRANCH_AT)
+        if not branch_root_ids:
+            return [], term_ids
+        do_not_trim_terms = []
+        trim_terms = []
+        for term_id in term_ids:
+            if node_is_in_branch(ontology=self.ontology, node_id=term_id, branch_root_ids=branch_root_ids):
+                do_not_trim_terms.append(term_id)
+            else:
+                trim_terms.append(term_id)
+        return do_not_trim_terms, trim_terms
+
     def reduce_num_terms(self, terms: Set[str], min_distance_from_root: int = 0) -> TrimmingResult:
         """
         Reduce the initial set of terms by resolving parent child relationships, deleting overlap with previous
@@ -177,11 +203,15 @@ def reduce_num_terms(self, terms: Set[str], min_distance_from_root: int = 0) ->
         max_terms = self.config.get_module_property(module=self.module,
                                                     prop=ConfigModuleProperty.MAX_NUM_TERMS_IN_SENTENCE)
         if 0 < max_terms < len(terms):
-            trimming_result = self.trimmer.trim(terms, max_terms, min_distance_from_root)
+            do_not_trim_terms, trim_terms = self.separate_do_not_trim_from_trim_terms(term_ids=terms)
+            trimming_result = self.trimmer.trim(trim_terms, max_terms, min_distance_from_root)
+            if do_not_trim_terms:
+                trimming_result.final_terms.extend(do_not_trim_terms)
+                trimming_result.covered_nodes.update(do_not_trim_terms)
         else:
             trimming_result.final_terms = terms
             trimming_result.covered_nodes = terms
-            self.terms_already_covered.update(terms)
+        self.terms_already_covered.update(terms)
         if self.config.get_module_property(module=self.module, prop=ConfigModuleProperty.DEL_CHILDREN_IF_PARENT):
             trimming_result.final_terms = self.remove_children_if_parents_present(
                 terms=trimming_result.final_terms, ontology=self.ontology,
diff --git a/genedescriptions/ontology_tools.py b/genedescriptions/ontology_tools.py
index 0f8b996..c55b1ea 100644
--- a/genedescriptions/ontology_tools.py
+++ b/genedescriptions/ontology_tools.py
@@ -208,4 +208,21 @@ def _set_information_content_in_subgraph(ontology: Ontology, root_id: str, maxle
                                                      relations=relations)
 
 
+def node_is_in_branch(ontology: Ontology, node_id: str, branch_root_ids: List[str]) -> bool:
+    """
+    Check whether a node belongs to at least one of the branches rooted at the provided ids
+
+    Ancestors are requested with reflexive=True, so that a branch root is expected to belong to its own branch
+
+    Args:
+        ontology (Ontology): the ontology containing the node
+        node_id (str): the id of the node to check
+        branch_root_ids (List[str]): the ids of the nodes at the root of the branches
+    Returns:
+        bool: True if any of the ids returned by the ancestors lookup is one of the branch roots, False otherwise
+    """
+    branch_roots = set(branch_root_ids)
+    return any(ancestor_id in branch_roots for ancestor_id in ontology.ancestors(node=node_id, reflexive=True))
+
+
 
diff --git a/wormbase/config_wb.yml b/wormbase/config_wb.yml
index 4aa29da..4ff2a0f 100644
--- a/wormbase/config_wb.yml
+++ b/wormbase/config_wb.yml
@@ -548,6 +548,9 @@ do_exp_sentences_options:
     - "DOID:0080015"
     - "DOID:0050117"
     - "DOID:0080014"
+  do_not_trim_branch_at:
+    - "DOID:10652"
+    - "DOID:9884"
   evidence_codes:
     IMP:
       group: EXPERIMENTAL
@@ -636,6 +639,9 @@ do_via_orth_sentences_options:
     - "DOID:0080015"
     - "DOID:0050117"
     - "DOID:0080014"
+  do_not_trim_branch_at:
+    - "DOID:10652"
+    - "DOID:9884"
   evidence_codes:
     ISS:
       group: ORTHOLOGY_BASED