From 036e7582f2ee2b96f18b5737b4981b97c9922546 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcos=20Mart=C3=ADnez=20Galindo?= Date: Thu, 15 Aug 2024 05:21:34 +0100 Subject: [PATCH 1/6] :sparkles: Added LinkerGLINER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marcos Martínez Galindo --- .../dataset/med_mentions/entities.py | 8 +++ .../evaluation/dataset/ontonotes/entities.py | 13 +++++ zshot/evaluation/evaluator.py | 16 ++++- zshot/evaluation/run_evaluation.py | 46 +++++++++++++-- zshot/evaluation/zshot_evaluate.py | 8 ++- zshot/linker/__init__.py | 1 + zshot/linker/linker_gliner.py | 58 +++++++++++++++++++ zshot/utils/data_models/span.py | 7 ++- 8 files changed, 143 insertions(+), 14 deletions(-) create mode 100644 zshot/linker/linker_gliner.py diff --git a/zshot/evaluation/dataset/med_mentions/entities.py b/zshot/evaluation/dataset/med_mentions/entities.py index 294861d..81e6f57 100644 --- a/zshot/evaluation/dataset/med_mentions/entities.py +++ b/zshot/evaluation/dataset/med_mentions/entities.py @@ -10,6 +10,14 @@ "NEG": "NEG" } +MEDMENTIONS_EXPLANATORY_MAPPING = { + k: v.replace("_", " ") for k, v in MEDMENTIONS_TYPE_INV.items() +} + +MEDMENTIONS_EXPLANATORY_INVERSE_MAPPING = { + v:k for k, v in MEDMENTIONS_EXPLANATORY_MAPPING.items() +} + MEDMENTIONS_SPLITS = { "train": ['Biologic_Function', 'Chemical', 'Health_Care_Activity', 'Anotomical_Structure', "Finding", "Spatial_Concept", "Intellectual_Product", "Research_Activity", 'Medical_Device', 'Eukaryote', diff --git a/zshot/evaluation/dataset/ontonotes/entities.py b/zshot/evaluation/dataset/ontonotes/entities.py index f54bad0..e95640a 100644 --- a/zshot/evaluation/dataset/ontonotes/entities.py +++ b/zshot/evaluation/dataset/ontonotes/entities.py @@ -1,5 +1,18 @@ from zshot.utils.data_models import Entity +ONTONOTES_EXPLANATORY_MAPPING = { + 'PERSON': "Person", "NORP": "Affiliation", "FAC": "Building name", + "ORG": "Organization", "GPE": "Geopolitical Entity", "LOC": "Location", "PRODUCT": "Product", + "DATE": "Date", "TIME": "Time", "PERCENT": "Percentage", "MONEY": "Money", + "QUANTITY": "Quantity", "ORDINAL": "Ordinal", "CARDINAL": "Cardinal", "EVENT": "Event", + "WORK_OF_ART": "Work of Art", "LAW": "Law", "LANGUAGE": "Language", + "NEG": "NEG" +} + +ONTONOTES_EXPLANATORY_INVERSE_MAPPING = { + v:k for k, v in ONTONOTES_EXPLANATORY_MAPPING.items() +} + ONTONOTES_ENTITIES = [Entity(name='NEG', description="Coal, water, oil, etc. are normally used for traditional electricity " "generation. 
However using liquefied natural gas as fuel for joint " diff --git a/zshot/evaluation/evaluator.py b/zshot/evaluation/evaluator.py index 6edd34c..412eea5 100644 --- a/zshot/evaluation/evaluator.py +++ b/zshot/evaluation/evaluator.py @@ -10,12 +10,19 @@ class ZeroShotTokenClassificationEvaluator(TokenClassificationEvaluator): def __init__(self, task="token-classification", default_metric_name=None, - mode: Optional[str] = 'span', alignment_mode=AlignmentMode.expand): + mode: Optional[str] = 'span', alignment_mode=AlignmentMode.expand, + entity_mapper: Optional[Dict[str, str]] = None): super().__init__(task, default_metric_name) self.alignment_mode = alignment_mode self.mode = mode + self.entity_mapper = entity_mapper def process_label(self, label): + if label != "O": + if self.entity_mapper is not None: + label_prefix = label[:2] + label = label_prefix + self.entity_mapper[label[2:]] + return f"B-{label[2:]}" if label.startswith("I-") and self.mode == 'token' else label def prepare_data(self, data: Union[str, Dataset], input_column: str, label_column: str, join_by: str): @@ -55,12 +62,17 @@ def prepare_pipeline( class MentionsExtractorEvaluator(ZeroShotTokenClassificationEvaluator): def __init__(self, task="token-classification", default_metric_name=None, - mode: Optional[str] = 'span', alignment_mode=AlignmentMode.expand): + mode: Optional[str] = 'span', alignment_mode=AlignmentMode.expand, + entity_mapper: Optional[Dict[str, str]] = None): super().__init__(task, default_metric_name, alignment_mode=alignment_mode) self.mode = mode + self.entity_mapper = entity_mapper def process_label(self, label): if label != "O": + if self.entity_mapper is not None: + label_prefix = label[:2] + label = label_prefix + self.entity_mapper[label[2:]] if (label.startswith("B-") or label.startswith("I-")) and self.mode == 'span': label = label[:2] + "MENTION" else: diff --git a/zshot/evaluation/run_evaluation.py b/zshot/evaluation/run_evaluation.py index 3e7e463..ac41e83 100644 --- a/zshot/evaluation/run_evaluation.py +++ b/zshot/evaluation/run_evaluation.py @@ -1,11 +1,19 @@ import argparse +from typing import Union import spacy +from datasets import DatasetDict + from zshot import PipelineConfig from zshot.evaluation import load_medmentions_zs, load_ontonotes_zs +from zshot.evaluation.dataset.dataset import DatasetWithEntities +from zshot.evaluation.dataset.med_mentions.entities import MEDMENTIONS_EXPLANATORY_MAPPING, \ + MEDMENTIONS_EXPLANATORY_INVERSE_MAPPING +from zshot.evaluation.dataset.ontonotes.entities import ONTONOTES_EXPLANATORY_INVERSE_MAPPING, \ + ONTONOTES_EXPLANATORY_MAPPING from zshot.evaluation.metrics.seqeval.seqeval import Seqeval from zshot.evaluation.zshot_evaluate import evaluate, prettify_evaluate_report -from zshot.linker import LinkerTARS, LinkerSMXM, LinkerRegen +from zshot.linker import LinkerTARS, LinkerSMXM, LinkerRegen, LinkerGLINER from zshot.mentions_extractor import MentionsExtractorSpacy, MentionsExtractorFlair, \ MentionsExtractorSMXM, MentionsExtractorTARS from zshot.mentions_extractor.utils import ExtractorType @@ -21,23 +29,43 @@ LINKERS = { "regen": LinkerRegen, "tars": LinkerTARS, - "smxm": LinkerSMXM + "smxm": LinkerSMXM, + "gliner": LinkerGLINER +} +END2END = ['tars', 'smxm', 'gliner'] +ENTITIES_MAPPERS = { + "medmentions": MEDMENTIONS_EXPLANATORY_MAPPING, + "ontonotes": ONTONOTES_EXPLANATORY_MAPPING +} +ENTITIES_INVERSE_MAPPERS = { + "medmentions": MEDMENTIONS_EXPLANATORY_INVERSE_MAPPING, + "ontonotes": ONTONOTES_EXPLANATORY_INVERSE_MAPPING } -END2END = ['tars', 
'smxm'] + + +def convert_entities(dataset: DatasetWithEntities, dataset_name: str) -> DatasetWithEntities: + mapping = ENTITIES_MAPPERS[dataset_name] + for entity in dataset.entities: + entity.name = mapping[entity.name] + + return dataset + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--dataset", default="ontonotes", type=str, - help="Name or path to the validation data. Comma separated") + help="Name or names of the datasets. One of: ontonotes; medmentions. Comma separated") parser.add_argument("--splits", required=False, default="test", type=str, help="Splits to evaluate. Comma separated") parser.add_argument("--mode", required=False, default="full", type=str, help="Evaluation mode. One of: full; mentions_extractor; linker") + parser.add_argument("--entity_name", default="original", type=str, + help="Type of entity name. One of: original; explanatory. Original by default.") parser.add_argument("--mentions_extractor", required=False, default="all", type=str, help="Mentions extractor to evaluate. " "One of: all; spacy_pos; spacy_ner; flair_pos; flair_ner; smxm; tars") parser.add_argument("--linker", required=False, default="all", type=str, - help="Linker to evaluate. One of: all; regen; smxm; tars") + help="Linker to evaluate. One of: all; regen; smxm; tars; gliner") parser.add_argument("--show_full_report", action="store_false", help="Show evalution report for each label. True by default") @@ -99,8 +127,14 @@ dataset = load_ontonotes_zs(split) else: raise ValueError(f"{dataset_name} not supported") + + if args.entity_name == "explanatory": + convert_entities(dataset, dataset_name) + nlp.get_pipe("zshot").mentions = dataset.entities nlp.get_pipe("zshot").entities = dataset.entities - evaluation = evaluate(nlp, dataset, metric=Seqeval(), mode=mode) + evaluation = evaluate(nlp, dataset, metric=Seqeval(), mode=mode, + entity_mapper=ENTITIES_MAPPERS[dataset_name] if args.entity_name != "original" + else None) print("\n".join(prettify_evaluate_report(evaluation, name=f"{dataset_name}-{split}"))) diff --git a/zshot/evaluation/zshot_evaluate.py b/zshot/evaluation/zshot_evaluate.py index 4695953..93dc9d1 100644 --- a/zshot/evaluation/zshot_evaluate.py +++ b/zshot/evaluation/zshot_evaluate.py @@ -14,7 +14,8 @@ def evaluate(nlp: spacy.language.Language, dataset: Dataset, metric: Optional[Union[str, EvaluationModule]] = Seqeval(), mode: Optional[str] = 'span', - batch_size: Optional[int] = 16) -> dict: + batch_size: Optional[int] = 16, + entity_mapper: Optional[Dict[str, str]] = None) -> dict: """ Evaluate a spacy zshot model :param nlp: Spacy Language pipeline with ZShot components @@ -25,10 +26,11 @@ def evaluate(nlp: spacy.language.Language, - token: The evaluation is done at token level, so if any of the tokens of the entity is missing the other are still valid :param batch_size: the batch size + :param entity_mapper: Mapper for entity names :return: Result of the evaluation. 
Dict with metrics results for each component """ - linker_evaluator = ZeroShotTokenClassificationEvaluator(mode=mode) - mentions_extractor_evaluator = MentionsExtractorEvaluator(mode=mode) + linker_evaluator = ZeroShotTokenClassificationEvaluator(mode=mode, entity_mapper=entity_mapper) + mentions_extractor_evaluator = MentionsExtractorEvaluator(mode=mode, entity_mapper=entity_mapper) results = {'evaluation_mode': mode} if nlp.get_pipe("zshot").linker: diff --git a/zshot/linker/__init__.py b/zshot/linker/__init__.py index 334b037..98a6c21 100644 --- a/zshot/linker/__init__.py +++ b/zshot/linker/__init__.py @@ -4,3 +4,4 @@ from zshot.linker.linker_smxm import LinkerSMXM # noqa: F401 from zshot.linker.linker_tars import LinkerTARS # noqa: F401 from zshot.linker.linker_ensemble import LinkerEnsemble # noqa: F401 +from zshot.linker.linker_gliner import LinkerGLINER # noqa: F401 diff --git a/zshot/linker/linker_gliner.py b/zshot/linker/linker_gliner.py new file mode 100644 index 0000000..addac6a --- /dev/null +++ b/zshot/linker/linker_gliner.py @@ -0,0 +1,58 @@ +from typing import Iterator, List, Optional, Union + +import pkgutil + +from spacy.tokens import Doc +from gliner import GLiNER + +from zshot.config import MODELS_CACHE_PATH +from zshot.linker.linker import Linker +from zshot.utils.data_models import Span + + +MODEL_NAME = "urchade/gliner_mediumv2.1" + + +class LinkerGLINER(Linker): + """ GLINER linker """ + + def __init__(self, model_name=MODEL_NAME): + super().__init__() + + if not pkgutil.find_loader("gliner"): + raise Exception("GLINER module not installed. You need to install gliner in order to use the GLINER Linker." + "Install it with: pip install gliner") + + self.model_name = model_name + self.model = None + + @property + def is_end2end(self) -> bool: + """ GLINER is end2end model""" + return True + + def load_models(self): + """ Load GLINER model """ + if self.model is None: + self.model = GLiNER.from_pretrained(self.model_name, cache_dir=MODELS_CACHE_PATH).to(self.device) + + def predict(self, docs: Iterator[Doc], batch_size: Optional[Union[int, None]] = None) -> List[List[Span]]: + """ + Perform the entity prediction + :param docs: A list of spacy Document + :param batch_size: The batch size + :return: List Spans for each Document in docs + """ + if not self._entities: + return [] + + labels = [ent.name for ent in self._entities] + sentences = [doc.text for doc in docs] + + self.load_models() + span_annotations = [] + for sent in sentences: + entities = self.model.predict_entities(sent, labels, threshold=0.5) + span_annotations.append([Span.from_dict(ent) for ent in entities]) + + return span_annotations diff --git a/zshot/utils/data_models/span.py b/zshot/utils/data_models/span.py index 786b0ac..2f17905 100644 --- a/zshot/utils/data_models/span.py +++ b/zshot/utils/data_models/span.py @@ -1,5 +1,6 @@ from typing import Any, Dict + import zlib import spacy @@ -57,10 +58,10 @@ def from_dict(d: Dict[str, Any]) -> "Span": end = d.get('end', None) if 'end' in d else d.get('end_char', None) label = d.get('label', None) score = d.get('score', None) - kb_id = d.get('kb_id', None) - if not start: + kb_id = d.get('kb_id', '') + if start is None: raise ValueError('One of [start, start_char] must be defined in dict.') - if not end: + if end is None: raise ValueError('One of [end, end_char] must be defined in dict.') if not label: raise ValueError('Label must be defined in dict.') From a15819d9751910e77788686eb927beff421499ed Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Marcos=20Mart=C3=ADnez=20Galindo?= Date: Thu, 15 Aug 2024 05:36:25 +0100 Subject: [PATCH 2/6] :sparkles: Added GLiNER Mentions Extractor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marcos Martínez Galindo --- zshot/evaluation/run_evaluation.py | 7 +-- zshot/mentions_extractor/__init__.py | 1 + .../mentions_extractor_gliner.py | 53 +++++++++++++++++++ 3 files changed, 58 insertions(+), 3 deletions(-) create mode 100644 zshot/mentions_extractor/mentions_extractor_gliner.py diff --git a/zshot/evaluation/run_evaluation.py b/zshot/evaluation/run_evaluation.py index ac41e83..914843e 100644 --- a/zshot/evaluation/run_evaluation.py +++ b/zshot/evaluation/run_evaluation.py @@ -15,7 +15,7 @@ from zshot.evaluation.zshot_evaluate import evaluate, prettify_evaluate_report from zshot.linker import LinkerTARS, LinkerSMXM, LinkerRegen, LinkerGLINER from zshot.mentions_extractor import MentionsExtractorSpacy, MentionsExtractorFlair, \ - MentionsExtractorSMXM, MentionsExtractorTARS + MentionsExtractorSMXM, MentionsExtractorTARS, MentionsExtractorGLINER from zshot.mentions_extractor.utils import ExtractorType MENTION_EXTRACTORS = { @@ -24,7 +24,8 @@ "flair_pos": lambda: MentionsExtractorFlair(ExtractorType.POS), "flair_ner": lambda: MentionsExtractorFlair(ExtractorType.NER), "smxm": lambda: MentionsExtractorSMXM, - "tars": lambda: MentionsExtractorTARS + "tars": lambda: MentionsExtractorTARS, + "gliner": lambda: MentionsExtractorGLINER } LINKERS = { "regen": LinkerRegen, @@ -63,7 +64,7 @@ def convert_entities(dataset: DatasetWithEntities, dataset_name: str) -> Dataset help="Type of entity name. One of: original; explanatory. Original by default.") parser.add_argument("--mentions_extractor", required=False, default="all", type=str, help="Mentions extractor to evaluate. " - "One of: all; spacy_pos; spacy_ner; flair_pos; flair_ner; smxm; tars") + "One of: all; spacy_pos; spacy_ner; flair_pos; flair_ner; smxm; tars; gliner") parser.add_argument("--linker", required=False, default="all", type=str, help="Linker to evaluate. 
One of: all; regen; smxm; tars; gliner") parser.add_argument("--show_full_report", action="store_false", diff --git a/zshot/mentions_extractor/__init__.py b/zshot/mentions_extractor/__init__.py index 9738c1e..199b5a1 100644 --- a/zshot/mentions_extractor/__init__.py +++ b/zshot/mentions_extractor/__init__.py @@ -2,4 +2,5 @@ from zshot.mentions_extractor.mentions_extractor_spacy import MentionsExtractorSpacy # noqa: F401 from zshot.mentions_extractor.mentions_extractor_smxm import MentionsExtractorSMXM # noqa: F401 from zshot.mentions_extractor.mentions_extractor_tars import MentionsExtractorTARS # noqa: F401 +from zshot.mentions_extractor.mentions_extractor_gliner import MentionsExtractorGLINER # noqa: F401 from zshot.mentions_extractor.mentions_extractor import MentionsExtractor # noqa: F401 diff --git a/zshot/mentions_extractor/mentions_extractor_gliner.py b/zshot/mentions_extractor/mentions_extractor_gliner.py new file mode 100644 index 0000000..120e2a9 --- /dev/null +++ b/zshot/mentions_extractor/mentions_extractor_gliner.py @@ -0,0 +1,53 @@ +from typing import Iterator, List, Optional, Union + +import pkgutil + +from spacy.tokens import Doc +from gliner import GLiNER + +from zshot.mentions_extractor.mentions_extractor import MentionsExtractor +from zshot.config import MODELS_CACHE_PATH +from zshot.utils.data_models import Span + + +MODEL_NAME = "urchade/gliner_mediumv2.1" + + +class MentionsExtractorGLINER(MentionsExtractor): + """ GLiNER Mentions Extractor """ + + def __init__(self, model_name=MODEL_NAME): + super().__init__() + + if not pkgutil.find_loader("gliner"): + raise Exception("GLINER module not installed. You need to install gliner in order to use the GLINER Linker." + "Install it with: pip install gliner") + + self.model_name = model_name + self.model = None + + def load_models(self): + """ Load GLINER model """ + if self.model is None: + self.model = GLiNER.from_pretrained(self.model_name, cache_dir=MODELS_CACHE_PATH).to(self.device) + + def predict(self, docs: Iterator[Doc], batch_size: Optional[Union[int, None]] = None) -> List[List[Span]]: + """ + Perform the entity prediction + :param docs: A list of spacy Document + :param batch_size: The batch size + :return: List Spans for each Document in docs + """ + if not self._mentions: + return [] + + labels = [ent.name for ent in self._mentions] + sentences = [doc.text for doc in docs] + + self.load_models() + span_annotations = [] + for sent in sentences: + entities = self.model.predict_entities(sent, labels, threshold=0.5) + span_annotations.append([Span.from_dict(ent) for ent in entities]) + + return span_annotations From 10c3076987fa62c1e739509baacd6eeeef49f7c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcos=20Mart=C3=ADnez=20Galindo?= Date: Thu, 15 Aug 2024 05:41:03 +0100 Subject: [PATCH 3/6] :memo: Update docs with GLiNER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marcos Martínez Galindo --- README.md | 13 +++++++------ docs/entity_linking.md | 3 +-- docs/gliner_linker.md | 11 +++++++++++ docs/gliner_mentions_extractor.md | 11 +++++++++++ docs/mentions_extractor.md | 5 ++++- docs/tars_mentions_extractor.md | 2 +- 6 files changed, 35 insertions(+), 10 deletions(-) create mode 100644 docs/gliner_linker.md create mode 100644 docs/gliner_mentions_extractor.md diff --git a/README.md b/README.md index a69167d..fe27d4e 100644 --- a/README.md +++ b/README.md @@ -47,9 +47,9 @@ Can be used to perform: ### Optional Dependencies -* flair - Required if you want to use 
Flair mentions extractor and for TARS linker. +* flair - Required if you want to use Flair mentions extractor and for TARS linker and TARS Mentions Extractor. * blink - Required if you want to use Blink for linking to Wikipedia pages. - +* gliner - Required if you want to use GLiNER Linker or GLiNER Mentions Extractor. ## Installation @@ -81,7 +81,7 @@ ZShot contains two different components, the **mentions extractor** and the **li ### Mentions Extractor The **mentions extractor** will detect the possible entities (a.k.a. mentions), that will be then linked to a data source (e.g.: Wikidata) by the **linker**. -Currently, there are 6 different **mentions extractors** supported, SMXM, TARS, 2 based on *SpaCy*, and 2 that are based on *Flair*. The two different versions for *SpaCy* and *Flair* are similar, one is based on Named Entity Recognition and Classification (NERC) and the other one is based on the linguistics (i.e.: using Part Of the Speech tagging (PoS) and Dependency Parsing(DP)). +Currently, there are 7 different **mentions extractors** supported, SMXM, TARS, GLiNER, 2 based on *SpaCy*, and 2 that are based on *Flair*. The two different versions for *SpaCy* and *Flair* are similar, one is based on Named Entity Recognition and Classification (NERC) and the other one is based on the linguistics (i.e.: using Part Of the Speech tagging (PoS) and Dependency Parsing(DP)). The NERC approach will use NERC models to detect all the entities that have to be linked. This approach depends on the model that is being used, and the entities the model has been trained on, so depending on the use case and the target entities it may be not the best approach, as the entities may be not recognized by the NERC model and thus won't be linked. @@ -90,14 +90,15 @@ The linguistic approach relies on the idea that mentions will usually be a synta ### Linker The **linker** will link the detected entities to a existing set of labels. Some of the **linkers**, however, are *end-to-end*, i.e. they don't need the **mentions extractor**, as they detect and link the entities at the same time. -Again, there are 4 **linkers** available currently, 2 of them are *end-to-end* and 2 are not. Let's start with those thar are not *end-to-end*: +Again, there are 5 **linkers** available currently, 3 of them are *end-to-end* and 2 are not. 
| Linker Name | end-to-end | Source Code | Paper | |:-----------:|:----------:|----------------------------------------------------------|--------------------------------------------------------------------| | Blink | X | [Source Code](https://github.com/facebookresearch/BLINK) | [Paper](https://arxiv.org/pdf/1911.03814.pdf) | | GENRE | X | [Source Code](https://github.com/facebookresearch/GENRE) | [Paper](https://arxiv.org/pdf/2010.00904.pdf) | -| SMXM | ✓ | [Source Code](https://github.com/Raldir/Zero-shot-NERC) | [Paper](https://aclanthology.org/2021.acl-long.120/) | -| TARS | ✓ | [Source Code](https://github.com/flairNLP/flair) | [Paper](https://kishaloyhalder.github.io/pdfs/tars_coling2020.pdf) | +| SMXM | ✓ | [Source Code](https://github.com/Raldir/Zero-shot-NERC) | [Paper](https://aclanthology.org/2021.acl-long.120/) | +| TARS | ✓ | [Source Code](https://github.com/flairNLP/flair) | [Paper](https://kishaloyhalder.github.io/pdfs/tars_coling2020.pdf) | +| GLINER | ✓ | [Source Code](https://github.com/urchade/GLiNER) | [Paper](https://arxiv.org/abs/2311.08526) | ### Relations Extractor The **relations extractor** will extract relations among different entities *previously* extracted by a **linker**.. diff --git a/docs/entity_linking.md b/docs/entity_linking.md index 010c4fc..0eb68d6 100644 --- a/docs/entity_linking.md +++ b/docs/entity_linking.md @@ -2,7 +2,6 @@ The **linker** will link the detected entities to a existing set of labels. Some of the **linkers**, however, are *end-to-end*, i.e. they don't need the **mentions extractor**, as they detect and link the entities at the same time. -There are 4 **linkers** available currently, 2 of them are *end-to-end* and 2 are not. Let's start with those thar are not *end-to-end*. - +There are 5 **linkers** available currently, 3 of them are *end-to-end* and 2 are not. ::: zshot.Linker \ No newline at end of file diff --git a/docs/gliner_linker.md b/docs/gliner_linker.md new file mode 100644 index 0000000..fd26c1b --- /dev/null +++ b/docs/gliner_linker.md @@ -0,0 +1,11 @@ +# GLiNER Linker + +GLiNER is a Named Entity Recognition (NER) model capable of identifying any entity type using a bidirectional transformer encoder (BERT-like). It provides a practical alternative to traditional NER models, which are limited to predefined entities, and Large Language Models (LLMs) that, despite their flexibility, are costly and large for resource-constrained scenarios. + +The GLiNER **linker** will use the **entities** specified in the `zshot.PipelineConfig`, it just uses the names of the entities, it doesn't use the descriptions of the entities. + + +- [Paper](https://arxiv.org/abs/2311.08526) +- [Original Source Code](https://github.com/urchade/GLiNER) + +::: zshot.linker.LinkerGLINER \ No newline at end of file diff --git a/docs/gliner_mentions_extractor.md b/docs/gliner_mentions_extractor.md new file mode 100644 index 0000000..47929ea --- /dev/null +++ b/docs/gliner_mentions_extractor.md @@ -0,0 +1,11 @@ +# GLiNER Mentions Extractor + +GLiNER is a Named Entity Recognition (NER) model capable of identifying any entity type using a bidirectional transformer encoder (BERT-like). It provides a practical alternative to traditional NER models, which are limited to predefined entities, and Large Language Models (LLMs) that, despite their flexibility, are costly and large for resource-constrained scenarios. 
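A minimal usage sketch, mirroring the pipeline set-up used in the tests added later in this series (`zshot/tests/mentions_extractor/test_gliner_mentions_extractor.py`); the mention names, descriptions and example sentence below are illustrative placeholders, not part of the library:

```python
import spacy

from zshot import PipelineConfig
from zshot.mentions_extractor import MentionsExtractorGLINER
from zshot.utils.data_models import Entity

nlp = spacy.blank("en")
config = PipelineConfig(
    # Uses the default "urchade/gliner_mediumv2.1" model; pass model_name=... to override.
    mentions_extractor=MentionsExtractorGLINER(),
    mentions=[
        Entity(name="person", description="Name of a person"),
        Entity(name="organization", description="Name of a company or institution"),
    ],
)
nlp.add_pipe("zshot", config=config, last=True)

doc = nlp("IBM announced a new partnership with NASA.")
# GLiNER only reads the mention names; the descriptions are ignored.
print(doc._.mentions)
```
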
+ +The GLiNER **mentions extractor** will use the **mentions** specified in the `zshot.PipelineConfig`, it just uses the names of the mentions, it doesn't use the descriptions of the mentions. + + +- [Paper](https://arxiv.org/abs/2311.08526) +- [Original Source Code](https://github.com/urchade/GLiNER) + +::: zshot.mentions_extractor.MentionsExtractorGLINER \ No newline at end of file diff --git a/docs/mentions_extractor.md b/docs/mentions_extractor.md index 26658f9..e732697 100644 --- a/docs/mentions_extractor.md +++ b/docs/mentions_extractor.md @@ -1,7 +1,7 @@ # MentionsExtractor The **mentions extractor** will detect the possible entities (a.k.a. mentions), that will be then linked to a data source (e.g.: Wikidata) by the **linker**. -Currently, there are 6 different **mentions extractors** supported, 2 of them are based on *SpaCy*, 2 of them are based on *Flair*, TARS and SMXM. The two different versions for *SpaCy* and *Flair* are similar, one is based on NERC and the other one is based on the linguistics (i.e.: using PoS and DP). The TARS and SMXM models can be used when the user wants to specify the mentions wanted to be extracted. +Currently, there are 7 different **mentions extractors** supported, 2 of them are based on *SpaCy*, 2 of them are based on *Flair*, TARS, SMXM and GLiNER. The two different versions for *SpaCy* and *Flair* are similar, one is based on NERC and the other one is based on the linguistics (i.e.: using PoS and DP). The TARS and SMXM models can be used when the user wants to specify the mentions wanted to be extracted. The NERC approach will use NERC models to detect all the entities that have to be linked. This approach depends on the model that is being used, and the entities the model has been trained on, so depending on the use case and the target entities it may be not the best approach, as the entities may be not recognized by the NERC model and thus won't be linked. @@ -10,4 +10,7 @@ The linguistic approach relies on the idea that mentions will usually be a synta The SMXM model uses the description of the mentions to give the model information about them. TARS model will use the labels of the mentions to detect them. + +The GLiNER model will use the labels of the mentions to detect them. 
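For reference, this is roughly the label-based flow that the GLiNER-based components in this series follow internally (paraphrased from `zshot/mentions_extractor/mentions_extractor_gliner.py`; the labels and sentence are illustrative, while the model name and threshold are the defaults used in that file):

```python
from gliner import GLiNER

from zshot.utils.data_models import Span

# The labels passed to GLiNER are simply the names of the configured mentions.
labels = ["person", "organization"]

model = GLiNER.from_pretrained("urchade/gliner_mediumv2.1")
predictions = model.predict_entities(
    "Satya Nadella is the CEO of Microsoft.", labels, threshold=0.5
)
# zshot converts each predicted span dict into its own Span objects.
spans = [Span.from_dict(p) for p in predictions]
```
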
+ ::: zshot.MentionsExtractor \ No newline at end of file diff --git a/docs/tars_mentions_extractor.md b/docs/tars_mentions_extractor.md index 5a84d27..854cc32 100644 --- a/docs/tars_mentions_extractor.md +++ b/docs/tars_mentions_extractor.md @@ -7,4 +7,4 @@ The TARS **mentions extractor** will use the **mentions** specified in the `zsho - [Paper](https://kishaloyhalder.github.io/pdfs/tars_coling2020.pdf) - [Original Source Code](https://github.com/flairNLP/flair) -::: zshot.linker.LinkerTARS \ No newline at end of file +::: zshot.mentions_extractor.MentionsExtractorTARS \ No newline at end of file From 250bd0b5cfc84faecd99388b3aa3cbf05b4aaeb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcos=20Mart=C3=ADnez=20Galindo?= Date: Thu, 15 Aug 2024 05:48:55 +0100 Subject: [PATCH 4/6] :white_check_mark: Add GLiNER tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marcos Martínez Galindo --- zshot/tests/linker/test_gliner_linker.py | 59 ++++++++++++++++ .../test_gliner_mentions_extractor.py | 70 +++++++++++++++++++ 2 files changed, 129 insertions(+) create mode 100644 zshot/tests/linker/test_gliner_linker.py create mode 100644 zshot/tests/mentions_extractor/test_gliner_mentions_extractor.py diff --git a/zshot/tests/linker/test_gliner_linker.py b/zshot/tests/linker/test_gliner_linker.py new file mode 100644 index 0000000..24ea27a --- /dev/null +++ b/zshot/tests/linker/test_gliner_linker.py @@ -0,0 +1,59 @@ +import gc +import logging + +import pytest +import spacy + +from zshot import PipelineConfig, Linker +from zshot.linker import LinkerGLINER +from zshot.tests.config import EX_DOCS, EX_ENTITIES + +logger = logging.getLogger(__name__) + + +@pytest.fixture(scope="module", autouse=True) +def teardown(): + logger.warning("Starting smxm tests") + yield True + gc.collect() + + +def test_gliner_download(): + linker = LinkerGLINER() + linker.load_models() + assert isinstance(linker, Linker) + del linker.model, linker + + +def test_smxm_linker(): + nlp = spacy.blank("en") + gliner_config = PipelineConfig( + linker=LinkerGLINER(), + entities=EX_ENTITIES + ) + nlp.add_pipe("zshot", config=gliner_config, last=True) + assert "zshot" in nlp.pipe_names + + doc = nlp(EX_DOCS[1]) + assert len(doc.ents) > 0 + docs = [doc for doc in nlp.pipe(EX_DOCS)] + assert all(len(doc.ents) > 0 for doc in docs) + del nlp.get_pipe('zshot').linker.model, nlp.get_pipe('zshot').linker + nlp.remove_pipe('zshot') + del doc, nlp, gliner_config + + +def test_smxm_linker_no_entities(): + nlp = spacy.blank("en") + gliner_config = PipelineConfig( + linker=LinkerGLINER(), + entities=[] + ) + nlp.add_pipe("zshot", config=gliner_config, last=True) + assert "zshot" in nlp.pipe_names + + doc = nlp(EX_DOCS[1]) + assert len(doc.ents) == 0 + del nlp.get_pipe('zshot').linker.model, nlp.get_pipe('zshot').linker + nlp.remove_pipe('zshot') + del doc, nlp, gliner_config diff --git a/zshot/tests/mentions_extractor/test_gliner_mentions_extractor.py b/zshot/tests/mentions_extractor/test_gliner_mentions_extractor.py new file mode 100644 index 0000000..264e663 --- /dev/null +++ b/zshot/tests/mentions_extractor/test_gliner_mentions_extractor.py @@ -0,0 +1,70 @@ +import gc +import logging + +import pytest +import spacy + +from zshot import PipelineConfig, MentionsExtractor +from zshot.mentions_extractor import MentionsExtractorGLINER +from zshot.tests.config import EX_DOCS, EX_ENTITIES + +logger = logging.getLogger(__name__) + + +@pytest.fixture(scope="module", autouse=True) +def teardown(): + 
logger.warning("Starting smxm tests") + yield True + gc.collect() + + +def test_gliner_download(): + mentions_extractor = MentionsExtractorGLINER() + mentions_extractor.load_models() + assert isinstance(mentions_extractor, MentionsExtractor) + del mentions_extractor + + +def test_gliner_mentions_extractor(): + nlp = spacy.blank("en") + gliner_config = PipelineConfig( + mentions_extractor=MentionsExtractorGLINER(), + mentions=EX_ENTITIES + ) + nlp.add_pipe("zshot", config=gliner_config, last=True) + assert "zshot" in nlp.pipe_names + + doc = nlp(EX_DOCS[1]) + assert len(doc._.mentions) > 0 + nlp.remove_pipe('zshot') + del doc, nlp + + +def test_gliner_mentions_extractor_pipeline(): + nlp = spacy.blank("en") + gliner_config = PipelineConfig( + mentions_extractor=MentionsExtractorGLINER(), + mentions=EX_ENTITIES + ) + nlp.add_pipe("zshot", config=gliner_config, last=True) + assert "zshot" in nlp.pipe_names + + docs = [doc for doc in nlp.pipe(EX_DOCS)] + assert all(len(doc._.mentions) > 0 for doc in docs) + nlp.remove_pipe('zshot') + del docs, nlp + + +def test_gliner_mentions_extractor_no_entities(): + nlp = spacy.blank("en") + gliner_config = PipelineConfig( + mentions_extractor=MentionsExtractorGLINER(), + mentions=[] + ) + nlp.add_pipe("zshot", config=gliner_config, last=True) + assert "zshot" in nlp.pipe_names + + doc = nlp(EX_DOCS[1]) + assert len(doc._.mentions) == 0 + nlp.remove_pipe('zshot') + del doc, nlp From de0707888e883698248c10bfb82edb185f129226 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcos=20Mart=C3=ADnez=20Galindo?= Date: Thu, 15 Aug 2024 05:52:45 +0100 Subject: [PATCH 5/6] Updated requirements test with GLiNER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marcos Martínez Galindo --- requirements/test.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/test.txt b/requirements/test.txt index 6a34802..9b82063 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -3,6 +3,7 @@ pytest-cov>=3.0.0 setuptools>=65.5.1 scipy<1.13.0 flair>=0.13 +gliner>=0.2.9 flake8>=4.0.1 coverage>=6.4.1 pydantic==1.9.2 From ba4850c14dfe81ff8cb42982f0c59c1ac10001ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcos=20Mart=C3=ADnez=20Galindo?= Date: Thu, 15 Aug 2024 05:59:37 +0100 Subject: [PATCH 6/6] :art: Fix Flake MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marcos Martínez Galindo --- zshot/evaluation/dataset/med_mentions/entities.py | 2 +- zshot/evaluation/dataset/ontonotes/entities.py | 2 +- zshot/evaluation/run_evaluation.py | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/zshot/evaluation/dataset/med_mentions/entities.py b/zshot/evaluation/dataset/med_mentions/entities.py index 81e6f57..2b3615d 100644 --- a/zshot/evaluation/dataset/med_mentions/entities.py +++ b/zshot/evaluation/dataset/med_mentions/entities.py @@ -15,7 +15,7 @@ } MEDMENTIONS_EXPLANATORY_INVERSE_MAPPING = { - v:k for k, v in MEDMENTIONS_EXPLANATORY_MAPPING.items() + v: k for k, v in MEDMENTIONS_EXPLANATORY_MAPPING.items() } MEDMENTIONS_SPLITS = { diff --git a/zshot/evaluation/dataset/ontonotes/entities.py b/zshot/evaluation/dataset/ontonotes/entities.py index e95640a..d7acf69 100644 --- a/zshot/evaluation/dataset/ontonotes/entities.py +++ b/zshot/evaluation/dataset/ontonotes/entities.py @@ -10,7 +10,7 @@ } ONTONOTES_EXPLANATORY_INVERSE_MAPPING = { - v:k for k, v in ONTONOTES_EXPLANATORY_MAPPING.items() + v: k for k, v in ONTONOTES_EXPLANATORY_MAPPING.items() } 
ONTONOTES_ENTITIES = [Entity(name='NEG', diff --git a/zshot/evaluation/run_evaluation.py b/zshot/evaluation/run_evaluation.py index 914843e..773744f 100644 --- a/zshot/evaluation/run_evaluation.py +++ b/zshot/evaluation/run_evaluation.py @@ -1,8 +1,6 @@ import argparse -from typing import Union import spacy -from datasets import DatasetDict from zshot import PipelineConfig from zshot.evaluation import load_medmentions_zs, load_ontonotes_zs
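
For completeness, a minimal end-to-end sketch of the linker introduced in PATCH 1/6, mirroring the set-up used in `zshot/tests/linker/test_gliner_linker.py`; the entities and input text are illustrative placeholders:

```python
import spacy

from zshot import PipelineConfig
from zshot.linker import LinkerGLINER
from zshot.utils.data_models import Entity

nlp = spacy.blank("en")
config = PipelineConfig(
    # LinkerGLINER is end-to-end, so no mentions extractor is required.
    linker=LinkerGLINER(),
    entities=[
        Entity(name="company", description="A commercial organization"),
        Entity(name="location", description="A geographical place"),
    ],
)
nlp.add_pipe("zshot", config=config, last=True)

doc = nlp("IBM headquarters are located in Armonk, New York.")
print([(ent.text, ent.label_) for ent in doc.ents])
```

Once `gliner` is installed (`pip install gliner`, as required by the check in `LinkerGLINER.__init__`), the same linker can also be evaluated via `zshot/evaluation/run_evaluation.py --linker gliner`, optionally with `--entity_name explanatory` to use the human-readable label mappings added in this series.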