diff --git a/changelog.md b/changelog.md index be94c2340..208fe57c1 100644 --- a/changelog.md +++ b/changelog.md @@ -11,6 +11,7 @@ - Measurements now correctly match "0.X", "0.XX", ... numbers - Typo in "celsius" measurement unit +- Spaces and digits are now supported in BRAT entity labels ## v0.10.2 diff --git a/edsnlp/connectors/brat.py b/edsnlp/connectors/brat.py index 0a93fe53e..b796a62c3 100644 --- a/edsnlp/connectors/brat.py +++ b/edsnlp/connectors/brat.py @@ -45,6 +45,7 @@ def __init__( directory: Union[str, Path], n_jobs: int = 1, attributes: Optional[AttributesMappingArg] = None, + bool_attributes: Optional[List[str]] = [], span_groups: SpanSetterArg = ["ents", "*"], keep_raw_attribute_values: bool = False, ): @@ -57,6 +58,7 @@ def __init__( self.attr_map = attributes self.span_setter = validate_span_setter(span_groups) self.keep_raw_attribute_values = keep_raw_attribute_values + self.bool_attributes = list(bool_attributes) def brat2docs(self, nlp: PipelineProtocol, run_pipe=False) -> List[Doc]: res = read_standoff( @@ -66,7 +68,7 @@ def brat2docs(self, nlp: PipelineProtocol, run_pipe=False) -> List[Doc]: span_attributes=self.attr_map, span_setter=self.span_setter, keep_raw_attribute_values=self.keep_raw_attribute_values, - bool_attributes=[], + bool_attributes=self.bool_attributes, ) return list(nlp.pipe(res) if run_pipe else res) diff --git a/edsnlp/data/converters.py b/edsnlp/data/converters.py index 6d98f2b34..bc759bbd1 100644 --- a/edsnlp/data/converters.py +++ b/edsnlp/data/converters.py @@ -179,7 +179,7 @@ def __init__( ): self.tokenizer = tokenizer or (nlp.tokenizer if nlp is not None else None) self.span_setter = span_setter - self.span_attributes = span_attributes + self.span_attributes = span_attributes # type: ignore self.keep_raw_attribute_values = keep_raw_attribute_values self.bool_attributes = bool_attributes @@ -190,10 +190,12 @@ def __call__(self, obj): spans = [] - if self.span_attributes is not None: - for dst in self.span_attributes.values(): - if not Span.has_extension(dst): - Span.set_extension(dst, default=None) + for dst in ( + *(() if self.span_attributes is None else self.span_attributes.values()), + *self.bool_attributes, + ): + if not Span.has_extension(dst): + Span.set_extension(dst, default=None) for ent in obj.get("entities") or (): for fragment in ent["fragments"]: @@ -351,10 +353,12 @@ def __call__(self, obj): spans = [] - if self.span_attributes is not None: - for dst in self.span_attributes.values(): - if not Span.has_extension(dst): - Span.set_extension(dst, default=None) + for dst in ( + *(() if self.span_attributes is None else self.span_attributes.values()), + *self.bool_attributes, + ): + if not Span.has_extension(dst): + Span.set_extension(dst, default=None) for ent in obj.get("entities") or (): ent = dict(ent) diff --git a/edsnlp/data/standoff.py b/edsnlp/data/standoff.py index aa753ea82..f44775fd7 100644 --- a/edsnlp/data/standoff.py +++ b/edsnlp/data/standoff.py @@ -29,10 +29,10 @@ from edsnlp.utils.collections import flatten_once from edsnlp.utils.span_getters import SpanSetterArg -REGEX_ENTITY = re.compile(r"^(T\d+)\t(\S+)([^\t]+)\t(.*)$") +REGEX_ENTITY = re.compile(r"^(T\d+)\t(.*) (\d+ \d+(?:;\d+ \d+)*)\t(.*)$") REGEX_NOTE = re.compile(r"^(#\d+)\tAnnotatorNotes ([^\t]+)\t(.*)$") REGEX_RELATION = re.compile(r"^(R\d+)\t(\S+) Arg1:(\S+) Arg2:(\S+)") -REGEX_ATTRIBUTE = re.compile(r"^([AM]\d+)\t(.+)$") +REGEX_ATTRIBUTE = re.compile(r"^([AM]\d+)\t(.+?) ([TE]\d+)(?: (.+))?$") REGEX_EVENT = re.compile(r"^(E\d+)\t(.+)$") REGEX_EVENT_PART = re.compile(r"(\S+):([TE]\d+)") @@ -131,19 +131,14 @@ def parse_standoff_file(path: str, merge_spaced_fragments: bool = True) -> Dict: match = REGEX_ATTRIBUTE.match(line) if match is None: raise BratParsingError(ann_file, line) - parts = match.group(2).split(" ") - if len(parts) >= 3: - entity, entity_id, value = parts - elif len(parts) == 2: - entity, entity_id = parts - value = None - else: + _, attr_name, entity_id, value = match.groups() + if attr_name is None: raise BratParsingError(ann_file, line) ( entities[entity_id] if entity_id.startswith("T") else events[entity_id] - )["attributes"][entity] = value + )["attributes"][attr_name] = value elif line.startswith("R"): match = REGEX_RELATION.match(line) if match is None: diff --git a/tests/data/test_standoff.py b/tests/data/test_standoff.py index e2f0fffda..b72bd6b3c 100644 --- a/tests/data/test_standoff.py +++ b/tests/data/test_standoff.py @@ -72,12 +72,12 @@ def brat2(tmpdir) -> BratConnector: @pytest.fixture def brat_importer(): brat_dir = Path(__file__).parent.parent.resolve() / "resources" / "brat_data" - return BratConnector(str(brat_dir)) + return BratConnector(str(brat_dir), bool_attributes=["bool flag 0"]) @pytest.fixture def brat_exporter(tmpdir): - return BratConnector(tmpdir, attributes=["etat", "assertion"]) + return BratConnector(tmpdir, attributes=["etat", "assertion", "bool flag 0"]) def test_empty_brat(brat2: BratConnector, blank_nlp: PipelineProtocol): @@ -128,7 +128,7 @@ def test_docs2brat(nlp, brat2): def assert_doc_read(doc): assert doc._.note_id == "subfolder/doc-1" - attrs = ("etat", "assertion") + attrs = ("etat", "assertion", "bool flag 0") spans_and_attributes = { "__ents__": sorted( [ @@ -149,35 +149,36 @@ def assert_doc_read(doc): assert spans_and_attributes == { "__ents__": [ - (6, 7, "douleurs", (None, None)), - (7, 11, "dans le bras droit", (None, None)), - (17, 21, "problème \nde locomotion", (None, "absent")), - (25, 26, "AVC", ("passé", "non-associé")), - (35, 36, "rhume", ("présent", "hypothétique")), - (45, 46, "rhume", ("présent", "hypothétique")), - (51, 52, "Douleurs", (None, None)), - (52, 56, "dans le bras droit", (None, None)), - (68, 69, "anomalie", (None, "absent")), + (6, 7, "douleurs", (None, None, False)), + (7, 11, "dans le bras droit", (None, None, False)), + (17, 21, "problème \nde locomotion", (None, "absent", True)), + (25, 26, "AVC", ("passé", "non-associé", False)), + (35, 36, "rhume", ("présent", "hypothétique", False)), + (45, 46, "rhume", ("présent", "hypothétique", False)), + (51, 52, "Douleurs", (None, None, False)), + (52, 56, "dans le bras droit", (None, None, False)), + (68, 69, "anomalie", (None, "absent", False)), ], "anatomie": [ - (9, 11, "bras droit", (None, None)), - (54, 56, "bras droit", (None, None)), + (9, 11, "bras droit", (None, None, False)), + (54, 56, "bras droit", (None, None, False)), ], "localisation": [ - (7, 11, "dans le bras droit", (None, None)), - (52, 56, "dans le bras droit", (None, None)), + (7, 11, "dans le bras droit", (None, None, False)), + (52, 56, "dans le bras droit", (None, None, False)), ], "pathologie": [ - (17, 21, "problème \nde locomotion", (None, "absent")), - (25, 26, "AVC", ("passé", "non-associé")), - (35, 36, "rhume", ("présent", "hypothétique")), - (45, 46, "rhume", ("présent", "hypothétique")), + (17, 21, "problème \nde locomotion", (None, "absent", True)), + (25, 26, "AVC", ("passé", "non-associé", False)), + (35, 36, "rhume", ("présent", "hypothétique", False)), + (45, 46, "rhume", ("présent", "hypothétique", False)), ], "sosy": [ - (6, 7, "douleurs", (None, None)), - (51, 52, "Douleurs", (None, None)), - (68, 69, "anomalie", (None, "absent")), + (6, 7, "douleurs", (None, None, False)), + (51, 52, "Douleurs", (None, None, False)), + (68, 69, "anomalie", (None, "absent", False)), ], + "test label 0": [(68, 69, "anomalie", (None, "absent", False))], } @@ -189,20 +190,23 @@ def assert_doc_write(exported_ann_text): "T3 anatomie 47 57 bras droit\n" "T4 pathologie 75 83;85 98 problème de locomotion\n" "A2 assertion T4 absent\n" + "A3 bool flag 0 T4\n" "T5 pathologie 114 117 AVC\n" - "A3 etat T5 passé\n" - "A4 assertion T5 non-associé\n" + "A4 etat T5 passé\n" + "A5 assertion T5 non-associé\n" "T6 pathologie 159 164 rhume\n" - "A5 etat T6 présent\n" - "A6 assertion T6 hypothétique\n" + "A6 etat T6 présent\n" + "A7 assertion T6 hypothétique\n" "T7 pathologie 291 296 rhume\n" - "A7 etat T7 présent\n" - "A8 assertion T7 hypothétique\n" + "A8 etat T7 présent\n" + "A9 assertion T7 hypothétique\n" "T8 sosy 306 314 Douleurs\n" "T9 localisation 315 333 dans le bras droit\n" "T10 anatomie 323 333 bras droit\n" "T11 sosy 378 386 anomalie\n" - "A9 assertion T11 absent\n" + "A10 assertion T11 absent\n" + "T12 test label 0 378 386 anomalie\n" + "A11 assertion T12 absent\n" ) @@ -228,15 +232,22 @@ def test_brat( def test_read_to_standoff(blank_nlp, tmpdir): input_dir = Path(__file__).parent.parent.resolve() / "resources" / "brat_data" output_dir = Path(tmpdir) - doc = list(edsnlp.data.read_standoff(input_dir))[0] + doc = list(edsnlp.data.read_standoff(input_dir, bool_attributes=["bool flag 0"]))[0] assert_doc_read(doc) doc.ents[0]._.etat = "test" edsnlp.data.write_standoff( [doc], output_dir, - span_attributes=["etat", "assertion"], - span_getter=["ents", "sosy", "localisation", "anatomie", "pathologie"], + span_attributes=["etat", "assertion", "bool flag 0"], + span_getter=[ + "ents", + "sosy", + "localisation", + "anatomie", + "pathologie", + "test label 0", + ], ) with open(output_dir / "subfolder" / "doc-1.ann") as f: diff --git a/tests/resources/brat_data/subfolder/doc-1.ann b/tests/resources/brat_data/subfolder/doc-1.ann index b9bc7af1e..5d9e3795e 100644 --- a/tests/resources/brat_data/subfolder/doc-1.ann +++ b/tests/resources/brat_data/subfolder/doc-1.ann @@ -4,6 +4,7 @@ T2 localisation 39 57 dans le bras droit T3 anatomie 47 57 bras droit T4 pathologie 75 83;85 98 problème de locomotion A1 assertion T4 absent +A9 bool flag 0 T4 T5 pathologie 114 117 AVC A2 etat T5 passé A3 assertion T5 non-associé @@ -22,3 +23,4 @@ R2 lieu Arg1:T1 Arg2:T2 A8 assertion T11 absent E1 MyArg1:T3 MyArg2:T1 E2 MyArg1:T1 MyArg2:E1 +T12 test label 0 378 386 anomalie